ruby-next-parser 3.1.1.3 → 3.4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2641 +0,0 @@
1
- %%machine lex; # % fix highlighting
2
-
3
- #
4
- # === BEFORE YOU START ===
5
- #
6
- # Read the Ruby Hacking Guide chapter 11, available in English at
7
- # http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/
8
- #
9
- # Remember two things about Ragel scanners:
10
- #
11
- # 1) Longest match wins.
12
- #
13
- # 2) If two matches have the same length, the first
14
- # in source code wins.
15
- #
16
- # General rules of making Ragel and Bison happy:
17
- #
18
- # * `p` (position) and `@te` contain the index of the character
19
- # they're pointing to ("current"), plus one. `@ts` contains the index
20
- # of the corresponding character. The code for extracting matched token is:
21
- #
22
- # @source_buffer.slice(@ts...@te)
23
- #
24
- # * If your input is `foooooooobar` and the rule is:
25
- #
26
- # 'f' 'o'+
27
- #
28
- # the result will be:
29
- #
30
- # foooooooobar
31
- # ^ ts=0 ^ p=te=9
32
- #
33
- # * A Ragel lexer action should not emit more than one token, unless
34
- # you know what you are doing.
35
- #
36
- # * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
37
- #
38
- # * If an action emits the token and transitions to another state, use
39
- # these Ragel commands:
40
- #
41
- # emit($whatever)
42
- # fnext $next_state; fbreak;
43
- #
44
- # If you perform `fgoto` in an action which does not emit a token nor
45
- # rewinds the stream pointer, the parser's side-effectful,
46
- # context-sensitive lookahead actions will break in a hard to detect
47
- # and debug way.
48
- #
49
- # * If an action does not emit a token:
50
- #
51
- # fgoto $next_state;
52
- #
53
- # * If an action features lookbehind, i.e. matches characters with the
54
- # intent of passing them to another action:
55
- #
56
- # p = @ts - 1
57
- # fgoto $next_state;
58
- #
59
- # or, if the lookbehind consists of a single character:
60
- #
61
- # fhold; fgoto $next_state;
62
- #
63
- # * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
64
- # `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
65
- # _will_ invoke the action `act`.
66
- #
67
- # e_something stands for "something with **e**mbedded action".
68
- #
69
- # * EOF is explicit and is matched by `c_eof`. If you want to introspect
70
- # the state of the lexer, add this rule to the state:
71
- #
72
- # c_eof => do_eof;
73
- #
74
- # * If you proceed past EOF, the lexer will complain:
75
- #
76
- # NoMethodError: undefined method `ord' for nil:NilClass
77
- #
78
-
79
- class Parser::Lexer
80
- class Next
81
-
82
- %% write data nofinal;
83
- # %
84
-
85
- ESCAPES = {
86
- ?a.ord => "\a", ?b.ord => "\b", ?e.ord => "\e", ?f.ord => "\f",
87
- ?n.ord => "\n", ?r.ord => "\r", ?s.ord => "\s", ?t.ord => "\t",
88
- ?v.ord => "\v", ?\\.ord => "\\"
89
- }.freeze
90
-
91
- REGEXP_META_CHARACTERS = Regexp.union(*"\\$()*+.<>?[]^{|}".chars).freeze
92
-
93
- attr_reader :source_buffer
94
-
95
- attr_accessor :diagnostics
96
- attr_accessor :static_env
97
- attr_accessor :force_utf32
98
-
99
- attr_accessor :cond, :cmdarg, :context, :command_start
100
-
101
- attr_accessor :tokens, :comments
102
-
103
- attr_reader :paren_nest, :cmdarg_stack, :cond_stack, :lambda_stack
104
-
105
- def initialize(version)
106
- @version = version
107
- @static_env = nil
108
- @context = nil
109
-
110
- @tokens = nil
111
- @comments = nil
112
-
113
- reset
114
- end
115
-
116
- def reset(reset_state=true)
117
- # Ragel state:
118
- if reset_state
119
- # Unit tests set state prior to resetting lexer.
120
- @cs = self.class.lex_en_line_begin
121
-
122
- @cond = StackState.new('cond')
123
- @cmdarg = StackState.new('cmdarg')
124
- @cond_stack = []
125
- @cmdarg_stack = []
126
- end
127
-
128
- @force_utf32 = false # Set to true by some tests
129
-
130
- @source_pts = nil # @source as a codepoint array
131
-
132
- @p = 0 # stream position (saved manually in #advance)
133
- @ts = nil # token start
134
- @te = nil # token end
135
- @act = 0 # next action
136
-
137
- @stack = [] # state stack
138
- @top = 0 # state stack top pointer
139
-
140
- # Lexer state:
141
- @token_queue = []
142
- @literal_stack = []
143
-
144
- @eq_begin_s = nil # location of last encountered =begin
145
- @sharp_s = nil # location of last encountered #
146
-
147
- @newline_s = nil # location of last encountered newline
148
-
149
- @num_base = nil # last numeric base
150
- @num_digits_s = nil # starting position of numeric digits
151
- @num_suffix_s = nil # starting position of numeric suffix
152
- @num_xfrm = nil # numeric suffix-induced transformation
153
-
154
- @escape_s = nil # starting position of current sequence
155
- @escape = nil # last escaped sequence, as string
156
-
157
- @herebody_s = nil # starting position of current heredoc line
158
-
159
- # Ruby 1.9 ->() lambdas emit a distinct token if do/{ is
160
- # encountered after a matching closing parenthesis.
161
- @paren_nest = 0
162
- @lambda_stack = []
163
-
164
- # After encountering the closing line of <<~SQUIGGLY_HEREDOC,
165
- # we store the indentation level and give it out to the parser
166
- # on request. It is not possible to infer indentation level just
167
- # from the AST because escape sequences such as `\ ` or `\t` are
168
- # expanded inside the lexer, but count as non-whitespace for
169
- # indentation purposes.
170
- @dedent_level = nil
171
-
172
- # If the lexer is in `command state' (aka expr_value)
173
- # at the entry to #advance, it will transition to expr_cmdarg
174
- # instead of expr_arg at certain points.
175
- @command_start = true
176
-
177
- # True at the end of "def foo a:"
178
- @in_kwarg = false
179
-
180
- # State before =begin / =end block comment
181
- @cs_before_block_comment = self.class.lex_en_line_begin
182
- end
183
-
184
- def source_buffer=(source_buffer)
185
- @source_buffer = source_buffer
186
-
187
- if @source_buffer
188
- source = @source_buffer.source
189
-
190
- if source.encoding == Encoding::UTF_8
191
- @source_pts = source.unpack('U*')
192
- else
193
- @source_pts = source.unpack('C*')
194
- end
195
-
196
- if @source_pts[0] == 0xfeff
197
- # Skip byte order mark.
198
- @p = 1
199
- end
200
- else
201
- @source_pts = nil
202
- end
203
- end
204
-
205
- def encoding
206
- @source_buffer.source.encoding
207
- end
208
-
209
- LEX_STATES = {
210
- :line_begin => lex_en_line_begin,
211
- :expr_dot => lex_en_expr_dot,
212
- :expr_fname => lex_en_expr_fname,
213
- :expr_value => lex_en_expr_value,
214
- :expr_beg => lex_en_expr_beg,
215
- :expr_mid => lex_en_expr_mid,
216
- :expr_arg => lex_en_expr_arg,
217
- :expr_cmdarg => lex_en_expr_cmdarg,
218
- :expr_end => lex_en_expr_end,
219
- :expr_endarg => lex_en_expr_endarg,
220
- :expr_endfn => lex_en_expr_endfn,
221
- :expr_labelarg => lex_en_expr_labelarg,
222
-
223
- :interp_string => lex_en_interp_string,
224
- :interp_words => lex_en_interp_words,
225
- :plain_string => lex_en_plain_string,
226
- :plain_words => lex_en_plain_string,
227
- }
228
-
229
- def state
230
- LEX_STATES.invert.fetch(@cs, @cs)
231
- end
232
-
233
- def state=(state)
234
- @cs = LEX_STATES.fetch(state)
235
- end
236
-
237
- def push_cmdarg
238
- @cmdarg_stack.push(@cmdarg)
239
- @cmdarg = StackState.new("cmdarg.#{@cmdarg_stack.count}")
240
- end
241
-
242
- def pop_cmdarg
243
- @cmdarg = @cmdarg_stack.pop
244
- end
245
-
246
- def push_cond
247
- @cond_stack.push(@cond)
248
- @cond = StackState.new("cond.#{@cond_stack.count}")
249
- end
250
-
251
- def pop_cond
252
- @cond = @cond_stack.pop
253
- end
254
-
255
- def dedent_level
256
- # We erase @dedent_level as a precaution to avoid accidentally
257
- # using a stale value.
258
- dedent_level, @dedent_level = @dedent_level, nil
259
- dedent_level
260
- end
261
-
262
- # Return next token: [type, value].
263
- def advance
264
- if @token_queue.any?
265
- return @token_queue.shift
266
- end
267
-
268
- # Ugly, but dependent on Ragel output. Consider refactoring it somehow.
269
- klass = self.class
270
- _lex_trans_keys = klass.send :_lex_trans_keys
271
- _lex_key_spans = klass.send :_lex_key_spans
272
- _lex_index_offsets = klass.send :_lex_index_offsets
273
- _lex_indicies = klass.send :_lex_indicies
274
- _lex_trans_targs = klass.send :_lex_trans_targs
275
- _lex_trans_actions = klass.send :_lex_trans_actions
276
- _lex_to_state_actions = klass.send :_lex_to_state_actions
277
- _lex_from_state_actions = klass.send :_lex_from_state_actions
278
- _lex_eof_trans = klass.send :_lex_eof_trans
279
-
280
- pe = @source_pts.size + 2
281
- p, eof = @p, pe
282
-
283
- cmd_state = @command_start
284
- @command_start = false
285
-
286
- %% write exec;
287
- # %
288
-
289
- # Ragel creates a local variable called `testEof` but it doesn't use
290
- # it in any assignment. This dead code is here to swallow the warning.
291
- # It has no runtime cost because Ruby doesn't produce any instructions from it.
292
- if false
293
- testEof
294
- end
295
-
296
- @p = p
297
-
298
- if @token_queue.any?
299
- @token_queue.shift
300
- elsif @cs == klass.lex_error
301
- [ false, [ '$error'.freeze, range(p - 1, p) ] ]
302
- else
303
- eof = @source_pts.size
304
- [ false, [ '$eof'.freeze, range(eof, eof) ] ]
305
- end
306
- end
307
-
308
- protected
309
-
310
- def eof_codepoint?(point)
311
- [0x04, 0x1a, 0x00].include? point
312
- end
313
-
314
- def version?(*versions)
315
- versions.include?(@version)
316
- end
317
-
318
- def stack_pop
319
- @top -= 1
320
- @stack[@top]
321
- end
322
-
323
- def encode_escape(ord)
324
- ord.chr.force_encoding(@source_buffer.source.encoding)
325
- end
326
-
327
- def tok(s = @ts, e = @te)
328
- @source_buffer.slice(s...e)
329
- end
330
-
331
- def range(s = @ts, e = @te)
332
- Parser::Source::Range.new(@source_buffer, s, e)
333
- end
334
-
335
- def emit(type, value = tok, s = @ts, e = @te)
336
- token = [ type, [ value, range(s, e) ] ]
337
-
338
- @token_queue.push(token)
339
-
340
- @tokens.push(token) if @tokens
341
-
342
- token
343
- end
344
-
345
- def emit_table(table, s = @ts, e = @te)
346
- value = tok(s, e)
347
-
348
- emit(table[value], value, s, e)
349
- end
350
-
351
- def emit_do(do_block=false)
352
- if @cond.active?
353
- emit(:kDO_COND, 'do'.freeze)
354
- elsif @cmdarg.active? || do_block
355
- emit(:kDO_BLOCK, 'do'.freeze)
356
- else
357
- emit(:kDO, 'do'.freeze)
358
- end
359
- end
360
-
361
- def arg_or_cmdarg(cmd_state)
362
- if cmd_state
363
- self.class.lex_en_expr_cmdarg
364
- else
365
- self.class.lex_en_expr_arg
366
- end
367
- end
368
-
369
- def emit_comment(s = @ts, e = @te)
370
- if @comments
371
- @comments.push(Parser::Source::Comment.new(range(s, e)))
372
- end
373
-
374
- if @tokens
375
- @tokens.push([ :tCOMMENT, [ tok(s, e), range(s, e) ] ])
376
- end
377
-
378
- nil
379
- end
380
-
381
- def diagnostic(type, reason, arguments=nil, location=range, highlights=[])
382
- @diagnostics.process(
383
- Parser::Diagnostic.new(type, reason, arguments, location, highlights))
384
- end
385
-
386
- #
387
- # === LITERAL STACK ===
388
- #
389
-
390
- def push_literal(*args)
391
- new_literal = Literal.new(self, *args)
392
- @literal_stack.push(new_literal)
393
- next_state_for_literal(new_literal)
394
- end
395
-
396
- def next_state_for_literal(literal)
397
- if literal.words? && literal.backslash_delimited?
398
- if literal.interpolate?
399
- self.class.lex_en_interp_backslash_delimited_words
400
- else
401
- self.class.lex_en_plain_backslash_delimited_words
402
- end
403
- elsif literal.words? && !literal.backslash_delimited?
404
- if literal.interpolate?
405
- self.class.lex_en_interp_words
406
- else
407
- self.class.lex_en_plain_words
408
- end
409
- elsif !literal.words? && literal.backslash_delimited?
410
- if literal.interpolate?
411
- self.class.lex_en_interp_backslash_delimited
412
- else
413
- self.class.lex_en_plain_backslash_delimited
414
- end
415
- else
416
- if literal.interpolate?
417
- self.class.lex_en_interp_string
418
- else
419
- self.class.lex_en_plain_string
420
- end
421
- end
422
- end
423
-
424
- def literal
425
- @literal_stack.last
426
- end
427
-
428
- def pop_literal
429
- old_literal = @literal_stack.pop
430
-
431
- @dedent_level = old_literal.dedent_level
432
-
433
- if old_literal.type == :tREGEXP_BEG
434
- # Fetch modifiers.
435
- self.class.lex_en_regexp_modifiers
436
- else
437
- self.class.lex_en_expr_end
438
- end
439
- end
440
-
441
- # Mapping of strings to parser tokens.
442
-
443
- PUNCTUATION = {
444
- '=' => :tEQL, '&' => :tAMPER2, '|' => :tPIPE,
445
- '!' => :tBANG, '^' => :tCARET, '+' => :tPLUS,
446
- '-' => :tMINUS, '*' => :tSTAR2, '/' => :tDIVIDE,
447
- '%' => :tPERCENT, '~' => :tTILDE, ',' => :tCOMMA,
448
- ';' => :tSEMI, '.' => :tDOT, '..' => :tDOT2,
449
- '...' => :tDOT3, '[' => :tLBRACK2, ']' => :tRBRACK,
450
- '(' => :tLPAREN2, ')' => :tRPAREN, '?' => :tEH,
451
- ':' => :tCOLON, '&&' => :tANDOP, '||' => :tOROP,
452
- '-@' => :tUMINUS, '+@' => :tUPLUS, '~@' => :tTILDE,
453
- '**' => :tPOW, '->' => :tLAMBDA, '=~' => :tMATCH,
454
- '!~' => :tNMATCH, '==' => :tEQ, '!=' => :tNEQ,
455
- '>' => :tGT, '>>' => :tRSHFT, '>=' => :tGEQ,
456
- '<' => :tLT, '<<' => :tLSHFT, '<=' => :tLEQ,
457
- '=>' => :tASSOC, '::' => :tCOLON2, '===' => :tEQQ,
458
- '<=>' => :tCMP, '[]' => :tAREF, '[]=' => :tASET,
459
- '{' => :tLCURLY, '}' => :tRCURLY, '`' => :tBACK_REF2,
460
- '!@' => :tBANG, '&.' => :tANDDOT, '.:' => :tMETHREF
461
- }
462
-
463
- PUNCTUATION_BEGIN = {
464
- '&' => :tAMPER, '*' => :tSTAR, '**' => :tDSTAR,
465
- '+' => :tUPLUS, '-' => :tUMINUS, '::' => :tCOLON3,
466
- '(' => :tLPAREN, '{' => :tLBRACE, '[' => :tLBRACK,
467
- }
468
-
469
- KEYWORDS = {
470
- 'if' => :kIF_MOD, 'unless' => :kUNLESS_MOD,
471
- 'while' => :kWHILE_MOD, 'until' => :kUNTIL_MOD,
472
- 'rescue' => :kRESCUE_MOD, 'defined?' => :kDEFINED,
473
- 'BEGIN' => :klBEGIN, 'END' => :klEND,
474
- }
475
-
476
- KEYWORDS_BEGIN = {
477
- 'if' => :kIF, 'unless' => :kUNLESS,
478
- 'while' => :kWHILE, 'until' => :kUNTIL,
479
- 'rescue' => :kRESCUE, 'defined?' => :kDEFINED,
480
- 'BEGIN' => :klBEGIN, 'END' => :klEND,
481
- }
482
-
483
- %w(class module def undef begin end then elsif else ensure case when
484
- for break next redo retry in do return yield super self nil true
485
- false and or not alias __FILE__ __LINE__ __ENCODING__).each do |keyword|
486
- KEYWORDS_BEGIN[keyword] = KEYWORDS[keyword] = :"k#{keyword.upcase}"
487
- end
488
-
489
- %%{
490
- # %
491
-
492
- access @;
493
- getkey (@source_pts[p] || 0);
494
-
495
- # === CHARACTER CLASSES ===
496
- #
497
- # Pay close attention to the differences between c_any and any.
498
- # c_any does not include EOF and so will cause incorrect behavior
499
- # for machine subtraction (any-except rules) and default transitions
500
- # for scanners.
501
-
502
- action do_nl {
503
- # Record position of a newline for precise location reporting on tNL
504
- # tokens.
505
- #
506
- # This action is embedded directly into c_nl, as it is idempotent and
507
- # there are no cases when we need to skip it.
508
- @newline_s = p
509
- }
510
-
511
- c_nl = '\n' $ do_nl;
512
- c_space = [ \t\r\f\v];
513
- c_space_nl = c_space | c_nl;
514
-
515
- c_eof = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
516
- c_eol = c_nl | c_eof;
517
- c_any = any - c_eof;
518
-
519
- c_nl_zlen = c_nl | zlen;
520
- c_line = any - c_nl_zlen;
521
-
522
- c_ascii = 0x00..0x7f;
523
- c_unicode = c_any - c_ascii;
524
- c_upper = [A-Z];
525
- c_lower = [a-z_] | c_unicode;
526
- c_alpha = c_lower | c_upper;
527
- c_alnum = c_alpha | [0-9];
528
-
529
- action do_eof {
530
- # Sit at EOF indefinitely. #advance would return $eof each time.
531
- # This allows to feed the lexer more data if needed; this is only used
532
- # in tests.
533
- #
534
- # Note that this action is not embedded into e_eof like e_heredoc_nl and e_bs
535
- # below. This is due to the fact that scanner state at EOF is observed
536
- # by tests, and encapsulating it in a rule would break the introspection.
537
- fhold; fbreak;
538
- }
539
-
540
- #
541
- # === TOKEN DEFINITIONS ===
542
- #
543
-
544
- # All operators are punctuation. There is more to punctuation
545
- # than just operators. Operators can be overridden by user;
546
- # punctuation can not.
547
-
548
- # A list of operators which are valid in the function name context, but
549
- # have different semantics in others.
550
- operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' | '!@' ;
551
-
552
- # A list of operators which can occur within an assignment shortcut (+ → +=).
553
- operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' |
554
- '*' | '/' | '**' | '~' | '<<' | '>>' | '%' ;
555
-
556
- # A list of all user-definable operators not covered by groups above.
557
- operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' |
558
- '<' | '<=' | '>' | '>=' | '<=>' | '=>' ;
559
-
560
- # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
561
- # as they are ambiguous with interpolation `#{}` and should be counted.
562
- # These braces are not present in punctuation lists.
563
-
564
- # A list of punctuation which has different meaning when used at the
565
- # beginning of expression.
566
- punctuation_begin = '-' | '+' | '::' | '(' | '[' |
567
- '*' | '**' | '&' ;
568
-
569
- # A list of all punctuation except punctuation_begin.
570
- punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' |
571
- '::' | '?' | ':' | '.' | '..' | '...' ;
572
-
573
- # A list of keywords which have different meaning at the beginning of expression.
574
- keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ;
575
-
576
- # A list of keywords which accept an argument-like expression, i.e. have the
577
- # same post-processing as method calls or commands. Example: `yield 1`,
578
- # `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function.
579
- keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ;
580
-
581
- # A list of keywords which accept a literal function name as an argument.
582
- keyword_with_fname = 'def' | 'undef' | 'alias' ;
583
-
584
- # A list of keywords which accept an expression after them.
585
- keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' |
586
- 'for' | 'in' | 'do' | 'when' | 'begin' | 'class' |
587
- 'and' | 'or' ;
588
-
589
- # A list of keywords which accept a value, and treat the keywords from
590
- # `keyword_modifier` list as modifiers.
591
- keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ;
592
-
593
- # A list of keywords which do not accept an expression after them.
594
- keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' |
595
- 'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' |
596
- '__LINE__' | '__ENCODING__';
597
-
598
- # All keywords.
599
- keyword = keyword_with_value | keyword_with_mid |
600
- keyword_with_end | keyword_with_arg |
601
- keyword_with_fname | keyword_modifier ;
602
-
603
- constant = c_upper c_alnum*;
604
- bareword = c_alpha c_alnum*;
605
-
606
- call_or_var = c_lower c_alnum*;
607
- class_var = '@@' bareword;
608
- instance_var = '@' bareword;
609
- global_var = '$'
610
- ( bareword | digit+
611
- | [`'+~*$&?!@/\\;,.=:<>"] # `
612
- | '-' c_alnum
613
- )
614
- ;
615
-
616
- # Ruby accepts (and fails on) variables with leading digit
617
- # in literal context, but not in unquoted symbol body.
618
- class_var_v = '@@' c_alnum+;
619
- instance_var_v = '@' c_alnum+;
620
-
621
- label = bareword [?!]? ':';
622
-
623
- #
624
- # === NUMERIC PARSING ===
625
- #
626
-
627
- int_hex = ( xdigit+ '_' )* xdigit* '_'? ;
628
- int_dec = ( digit+ '_' )* digit* '_'? ;
629
- int_bin = ( [01]+ '_' )* [01]* '_'? ;
630
-
631
- flo_int = [1-9] [0-9]* ( '_' digit+ )* | '0';
632
- flo_frac = '.' ( digit+ '_' )* digit+;
633
- flo_pow = [eE] [+\-]? ( digit+ '_' )* digit+;
634
-
635
- int_suffix =
636
- '' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars) } }
637
- | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } }
638
- | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, chars)) } }
639
- | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } }
640
- | 're' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 2); p -= 2 } }
641
- | 'if' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 2); p -= 2 } }
642
- | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 6); p -= 6 } };
643
-
644
- flo_pow_suffix =
645
- '' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars)) } }
646
- | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Float(chars))) } }
647
- | 'if' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 2); p -= 2 } };
648
-
649
- flo_suffix =
650
- flo_pow_suffix
651
- | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } }
652
- | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } }
653
- | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 6); p -= 6 } };
654
-
655
- #
656
- # === ESCAPE SEQUENCE PARSING ===
657
- #
658
-
659
- # Escape parsing code is a Ragel pattern, not a scanner, and therefore
660
- # it shouldn't directly raise errors or perform other actions with side effects.
661
- # In reality this would probably just mess up error reporting in pathological
662
- # cases, through.
663
-
664
- # The amount of code required to parse \M\C stuff correctly is ridiculous.
665
-
666
- escaped_nl = "\\" c_nl;
667
-
668
- action unicode_points {
669
- @escape = ""
670
-
671
- codepoints = tok(@escape_s + 2, p - 1)
672
- codepoint_s = @escape_s + 2
673
-
674
- if @version < 24
675
- if codepoints.start_with?(" ") || codepoints.start_with?("\t")
676
- diagnostic :fatal, :invalid_unicode_escape, nil,
677
- range(@escape_s + 2, @escape_s + 3)
678
- end
679
-
680
- if spaces_p = codepoints.index(/[ \t]{2}/)
681
- diagnostic :fatal, :invalid_unicode_escape, nil,
682
- range(codepoint_s + spaces_p + 1, codepoint_s + spaces_p + 2)
683
- end
684
-
685
- if codepoints.end_with?(" ") || codepoints.end_with?("\t")
686
- diagnostic :fatal, :invalid_unicode_escape, nil, range(p - 1, p)
687
- end
688
- end
689
-
690
- codepoints.scan(/([0-9a-fA-F]+)|([ \t]+)/).each do |(codepoint_str, spaces)|
691
- if spaces
692
- codepoint_s += spaces.length
693
- else
694
- codepoint = codepoint_str.to_i(16)
695
-
696
- if codepoint >= 0x110000
697
- diagnostic :error, :unicode_point_too_large, nil,
698
- range(codepoint_s, codepoint_s + codepoint_str.length)
699
- break
700
- end
701
-
702
- @escape += codepoint.chr(Encoding::UTF_8)
703
- codepoint_s += codepoint_str.length
704
- end
705
- end
706
- }
707
-
708
- action unescape_char {
709
- codepoint = @source_pts[p - 1]
710
-
711
- if @version >= 30 && (codepoint == 117 || codepoint == 85) # 'u' or 'U'
712
- diagnostic :fatal, :invalid_escape
713
- end
714
-
715
- if (@escape = ESCAPES[codepoint]).nil?
716
- @escape = encode_escape(@source_buffer.slice(p - 1))
717
- end
718
- }
719
-
720
- action invalid_complex_escape {
721
- diagnostic :fatal, :invalid_escape
722
- }
723
-
724
- action read_post_meta_or_ctrl_char {
725
- @escape = @source_buffer.slice(p - 1).chr
726
-
727
- if @version >= 27 && ((0..8).include?(@escape.ord) || (14..31).include?(@escape.ord))
728
- diagnostic :fatal, :invalid_escape
729
- end
730
- }
731
-
732
- action slash_c_char {
733
- @escape = encode_escape(@escape[0].ord & 0x9f)
734
- }
735
-
736
- action slash_m_char {
737
- @escape = encode_escape(@escape[0].ord | 0x80)
738
- }
739
-
740
- maybe_escaped_char = (
741
- '\\' c_any %unescape_char
742
- | '\\x' xdigit{1,2} % { @escape = encode_escape(tok(p - 2, p).to_i(16)) } %slash_c_char
743
- | ( c_any - [\\] ) %read_post_meta_or_ctrl_char
744
- );
745
-
746
- maybe_escaped_ctrl_char = ( # why?!
747
- '\\' c_any %unescape_char %slash_c_char
748
- | '?' % { @escape = "\x7f" }
749
- | '\\x' xdigit{1,2} % { @escape = encode_escape(tok(p - 2, p).to_i(16)) } %slash_c_char
750
- | ( c_any - [\\?] ) %read_post_meta_or_ctrl_char %slash_c_char
751
- );
752
-
753
- escape = (
754
- # \377
755
- [0-7]{1,3}
756
- % { @escape = encode_escape(tok(@escape_s, p).to_i(8) % 0x100) }
757
-
758
- # \xff
759
- | 'x' xdigit{1,2}
760
- % { @escape = encode_escape(tok(@escape_s + 1, p).to_i(16)) }
761
-
762
- # %q[\x]
763
- | 'x' ( c_any - xdigit )
764
- % {
765
- diagnostic :fatal, :invalid_hex_escape, nil, range(@escape_s - 1, p + 2)
766
- }
767
-
768
- # \u263a
769
- | 'u' xdigit{4}
770
- % { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) }
771
-
772
- # \u123
773
- | 'u' xdigit{0,3}
774
- % {
775
- diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p)
776
- }
777
-
778
- # u{not hex} or u{}
779
- | 'u{' ( c_any - xdigit - [ \t}] )* '}'
780
- % {
781
- diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p)
782
- }
783
-
784
- # \u{ \t 123 \t 456 \t\t }
785
- | 'u{' [ \t]* ( xdigit{1,6} [ \t]+ )*
786
- (
787
- ( xdigit{1,6} [ \t]* '}'
788
- %unicode_points
789
- )
790
- |
791
- ( xdigit* ( c_any - xdigit - [ \t}] )+ '}'
792
- | ( c_any - [ \t}] )* c_eof
793
- | xdigit{7,}
794
- ) % {
795
- diagnostic :fatal, :unterminated_unicode, nil, range(p - 1, p)
796
- }
797
- )
798
-
799
- # \C-\a \cx
800
- | ( 'C-' | 'c' ) escaped_nl?
801
- maybe_escaped_ctrl_char
802
-
803
- # \M-a
804
- | 'M-' escaped_nl?
805
- maybe_escaped_char
806
- %slash_m_char
807
-
808
- # \C-\M-f \M-\cf \c\M-f
809
- | ( ( 'C-' | 'c' ) escaped_nl? '\\M-'
810
- | 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl?
811
- maybe_escaped_ctrl_char
812
- %slash_m_char
813
-
814
- | 'C' c_any %invalid_complex_escape
815
- | 'M' c_any %invalid_complex_escape
816
- | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape
817
-
818
- | ( c_any - [0-7xuCMc] ) %unescape_char
819
-
820
- | c_eof % {
821
- diagnostic :fatal, :escape_eof, nil, range(p - 1, p)
822
- }
823
- );
824
-
825
- # Use rules in form of `e_bs escape' when you need to parse a sequence.
826
- e_bs = '\\' % {
827
- @escape_s = p
828
- @escape = nil
829
- };
830
-
831
- #
832
- # === STRING AND HEREDOC PARSING ===
833
- #
834
-
835
- # Heredoc parsing is quite a complex topic. First, consider that heredocs
836
- # can be arbitrarily nested. For example:
837
- #
838
- # puts <<CODE
839
- # the result is: #{<<RESULT.inspect
840
- # i am a heredoc
841
- # RESULT
842
- # }
843
- # CODE
844
- #
845
- # which, incidentally, evaluates to:
846
- #
847
- # the result is: " i am a heredoc\n"
848
- #
849
- # To parse them, lexer refers to two kinds (remember, nested heredocs)
850
- # of positions in the input stream, namely heredoc_e
851
- # (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
852
- #
853
- # heredoc_e is simply contained inside the corresponding Literal, and
854
- # when the heredoc is closed, the lexing is restarted from that position.
855
- #
856
- # @herebody_s is quite more complex. First, @herebody_s changes after each
857
- # heredoc line is lexed. This way, at '\n' tok(@herebody_s, @te) always
858
- # contains the current line, and also when a heredoc is started, @herebody_s
859
- # contains the position from which the heredoc will be lexed.
860
- #
861
- # Second, as (insanity) there are nested heredocs, we need to maintain a
862
- # stack of these positions. Each time #push_literal is called, it saves current
863
- # @heredoc_s to literal.saved_herebody_s, and after an interpolation (possibly
864
- # containing another heredocs) is closed, the previous value is restored.
865
-
866
- e_heredoc_nl = c_nl % {
867
- # After every heredoc was parsed, @herebody_s contains the
868
- # position of next token after all heredocs.
869
- if @herebody_s
870
- p = @herebody_s
871
- @herebody_s = nil
872
- end
873
- };
874
-
875
- action extend_string {
876
- string = tok
877
-
878
- # tLABEL_END is only possible in non-cond context on >= 2.2
879
- if @version >= 22 && !@cond.active?
880
- lookahead = @source_buffer.slice(@te...@te+2)
881
- end
882
-
883
- current_literal = literal
884
- if !current_literal.heredoc? &&
885
- (token = current_literal.nest_and_try_closing(string, @ts, @te, lookahead))
886
- if token[0] == :tLABEL_END
887
- p += 1
888
- pop_literal
889
- fnext expr_labelarg;
890
- else
891
- fnext *pop_literal;
892
- end
893
- fbreak;
894
- else
895
- current_literal.extend_string(string, @ts, @te)
896
- end
897
- }
898
-
899
- action extend_string_escaped {
900
- current_literal = literal
901
- # Get the first character after the backslash.
902
- escaped_char = @source_buffer.slice(@escape_s).chr
903
-
904
- if current_literal.munge_escape? escaped_char
905
- # If this particular literal uses this character as an opening
906
- # or closing delimiter, it is an escape sequence for that
907
- # particular character. Write it without the backslash.
908
-
909
- if current_literal.regexp? && REGEXP_META_CHARACTERS.match(escaped_char)
910
- # Regular expressions should include escaped delimiters in their
911
- # escaped form, except when the escaped character is
912
- # a closing delimiter but not a regexp metacharacter.
913
- #
914
- # The backslash itself cannot be used as a closing delimiter
915
- # at the same time as an escape symbol, but it is always munged,
916
- # so this branch also executes for the non-closing-delimiter case
917
- # for the backslash.
918
- current_literal.extend_string(tok, @ts, @te)
919
- else
920
- current_literal.extend_string(escaped_char, @ts, @te)
921
- end
922
- else
923
- # It does not. So this is an actual escape sequence, yay!
924
- if current_literal.squiggly_heredoc? && escaped_char == "\n".freeze
925
- # Squiggly heredocs like
926
- # <<~-HERE
927
- # 1\
928
- # 2
929
- # HERE
930
- # treat '\' as a line continuation, but still dedent the body, so the heredoc above becomes "12\n".
931
- # This information is emitted as is, without escaping,
932
- # later this escape sequence (\\\n) gets handled manually in the Lexer::Dedenter
933
- current_literal.extend_string(tok, @ts, @te)
934
- elsif current_literal.supports_line_continuation_via_slash? && escaped_char == "\n".freeze
935
- # Heredocs, regexp and a few other types of literals support line
936
- # continuation via \\\n sequence. The code like
937
- # "a\
938
- # b"
939
- # must be parsed as "ab"
940
- current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)
941
- elsif current_literal.regexp? && @version >= 31 && %w[c C m M].include?(escaped_char)
942
- # Ruby >= 3.1 escapes \c- and \m chars, that's the only escape sequence
943
- # supported by regexes so far, so it needs a separate branch.
944
- current_literal.extend_string(@escape, @ts, @te)
945
- elsif current_literal.regexp?
946
- # Regular expressions should include escape sequences in their
947
- # escaped form. On the other hand, escaped newlines are removed (in cases like "\\C-\\\n\\M-x")
948
- current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)
949
- else
950
- current_literal.extend_string(@escape || tok, @ts, @te)
951
- end
952
- end
953
- }
954
-
955
- # Extend a string with a newline or a EOF character.
956
- # As heredoc closing line can immediately precede EOF, this action
957
- # has to handle such case specially.
958
- action extend_string_eol {
959
- current_literal = literal
960
- if @te == pe
961
- diagnostic :fatal, :string_eof, nil,
962
- range(current_literal.str_s, current_literal.str_s + 1)
963
- end
964
-
965
- if current_literal.heredoc?
966
- line = tok(@herebody_s, @ts).gsub(/\r+$/, ''.freeze)
967
-
968
- if version?(18, 19, 20)
969
- # See ruby:c48b4209c
970
- line = line.gsub(/\r.*$/, ''.freeze)
971
- end
972
-
973
- # Try ending the heredoc with the complete most recently
974
- # scanned line. @herebody_s always refers to the start of such line.
975
- if current_literal.nest_and_try_closing(line, @herebody_s, @ts)
976
- # Adjust @herebody_s to point to the next line.
977
- @herebody_s = @te
978
-
979
- # Continue regular lexing after the heredoc reference (<<END).
980
- p = current_literal.heredoc_e - 1
981
- fnext *pop_literal; fbreak;
982
- else
983
- # Calculate indentation level for <<~HEREDOCs.
984
- current_literal.infer_indent_level(line)
985
-
986
- # Ditto.
987
- @herebody_s = @te
988
- end
989
- else
990
- # Try ending the literal with a newline.
991
- if current_literal.nest_and_try_closing(tok, @ts, @te)
992
- fnext *pop_literal; fbreak;
993
- end
994
-
995
- if @herebody_s
996
- # This is a regular literal intertwined with a heredoc. Like:
997
- #
998
- # p <<-foo+"1
999
- # bar
1000
- # foo
1001
- # 2"
1002
- #
1003
- # which, incidentally, evaluates to "bar\n1\n2".
1004
- p = @herebody_s - 1
1005
- @herebody_s = nil
1006
- end
1007
- end
1008
-
1009
- if current_literal.words? && !eof_codepoint?(@source_pts[p])
1010
- current_literal.extend_space @ts, @te
1011
- else
1012
- # A literal newline is appended if the heredoc was _not_ closed
1013
- # this time (see fbreak above). See also Literal#nest_and_try_closing
1014
- # for rationale of calling #flush_string here.
1015
- current_literal.extend_string tok, @ts, @te
1016
- current_literal.flush_string
1017
- end
1018
- }
1019
-
1020
- action extend_string_space {
1021
- literal.extend_space @ts, @te
1022
- }
1023
-
1024
- #
1025
- # === INTERPOLATION PARSING ===
1026
- #
1027
-
1028
- # Interpolations with immediate variable names simply call into
1029
- # the corresponding machine.
1030
-
1031
- interp_var = '#' ( global_var | class_var_v | instance_var_v );
1032
-
1033
- action extend_interp_var {
1034
- current_literal = literal
1035
- current_literal.flush_string
1036
- current_literal.extend_content
1037
-
1038
- emit(:tSTRING_DVAR, nil, @ts, @ts + 1)
1039
-
1040
- p = @ts
1041
- fcall expr_variable;
1042
- }
1043
-
1044
- # Special case for Ruby >= 2.7
1045
- # If interpolated instance/class variable starts with a digit we parse it as a plain substring
1046
- # However, "#$1" is still a regular interpolation
1047
- interp_digit_var = '#' ('@' | '@@') digit c_alpha*;
1048
-
1049
- action extend_interp_digit_var { # attached to the interp_digit_var pattern in the interp_* scanners
1050
- if @version >= 27 # 2.7 and later: keep the matched text as ordinary string content
1051
- literal.extend_string(tok, @ts, @te)
1052
- else # earlier versions: report an invalid class or instance variable name
1053
- message = tok.start_with?('#@@') ? :cvar_name : :ivar_name
1054
- diagnostic :error, message, { :name => tok(@ts + 1, @te) }, range(@ts + 1, @te) # reported name and range start one character after @ts
1055
- end
1056
- }
1057
-
1058
- # Interpolations with code blocks must match nested curly braces, as
1059
- # interpolation ending is ambiguous with a block ending. So, every
1060
- # opening and closing brace should be matched with e_[lr]brace rules,
1061
- # which automatically perform the counting.
1062
- #
1063
- # Note that interpolations can themselves be nested, so brace balance
1064
- # is tied to the innermost literal.
1065
- #
1066
- # Also note that literals themselves should not use e_[lr]brace rules
1067
- # when matching their opening and closing delimiters, as the amount of
1068
- # braces inside the characters of a string literal is independent.
1069
-
1070
- interp_code = '#{';
1071
-
1072
- e_lbrace = '{' % {
1073
- @cond.push(false); @cmdarg.push(false)
1074
-
1075
- current_literal = literal
1076
- if current_literal
1077
- current_literal.start_interp_brace
1078
- end
1079
- };
1080
-
1081
- e_rbrace = '}' % {
1082
- current_literal = literal
1083
- if current_literal
1084
- if current_literal.end_interp_brace_and_try_closing
1085
- if version?(18, 19)
1086
- emit(:tRCURLY, '}'.freeze, p - 1, p)
1087
- @cond.lexpop
1088
- @cmdarg.lexpop
1089
- else
1090
- emit(:tSTRING_DEND, '}'.freeze, p - 1, p)
1091
- end
1092
-
1093
- if current_literal.saved_herebody_s
1094
- @herebody_s = current_literal.saved_herebody_s
1095
- end
1096
-
1097
-
1098
- fhold;
1099
- fnext *next_state_for_literal(current_literal);
1100
- fbreak;
1101
- end
1102
- end
1103
-
1104
- @paren_nest -= 1
1105
- };
1106
-
1107
- action extend_interp_code {
1108
- current_literal = literal
1109
- current_literal.flush_string
1110
- current_literal.extend_content
1111
-
1112
- emit(:tSTRING_DBEG, '#{'.freeze)
1113
-
1114
- if current_literal.heredoc?
1115
- current_literal.saved_herebody_s = @herebody_s
1116
- @herebody_s = nil
1117
- end
1118
-
1119
- current_literal.start_interp_brace
1120
- @command_start = true
1121
- fnext expr_value;
1122
- fbreak;
1123
- }
1124
-
1125
- # Actual string parsers are simply combined from the primitives defined
1126
- # above.
1127
-
1128
- interp_words := |*
1129
- interp_code => extend_interp_code;
1130
- interp_digit_var => extend_interp_digit_var;
1131
- interp_var => extend_interp_var;
1132
- e_bs escape => extend_string_escaped;
1133
- c_space+ => extend_string_space;
1134
- c_eol => extend_string_eol;
1135
- c_any => extend_string;
1136
- *|;
1137
-
1138
- interp_string := |*
1139
- interp_code => extend_interp_code;
1140
- interp_digit_var => extend_interp_digit_var;
1141
- interp_var => extend_interp_var;
1142
- e_bs escape => extend_string_escaped;
1143
- c_eol => extend_string_eol;
1144
- c_any => extend_string;
1145
- *|;
1146
-
1147
- plain_words := |*
1148
- e_bs c_any => extend_string_escaped;
1149
- c_space+ => extend_string_space;
1150
- c_eol => extend_string_eol;
1151
- c_any => extend_string;
1152
- *|;
1153
-
1154
- plain_string := |*
1155
- '\\' c_nl => extend_string_eol;
1156
- e_bs c_any => extend_string_escaped;
1157
- c_eol => extend_string_eol;
1158
- c_any => extend_string;
1159
- *|;
1160
-
1161
- interp_backslash_delimited := |*
1162
- interp_code => extend_interp_code;
1163
- interp_digit_var => extend_interp_digit_var;
1164
- interp_var => extend_interp_var;
1165
- c_eol => extend_string_eol;
1166
- c_any => extend_string;
1167
- *|;
1168
-
1169
- plain_backslash_delimited := |*
1170
- c_eol => extend_string_eol;
1171
- c_any => extend_string;
1172
- *|;
1173
-
1174
- interp_backslash_delimited_words := |*
1175
- interp_code => extend_interp_code;
1176
- interp_digit_var => extend_interp_digit_var;
1177
- interp_var => extend_interp_var;
1178
- c_space+ => extend_string_space;
1179
- c_eol => extend_string_eol;
1180
- c_any => extend_string;
1181
- *|;
1182
-
1183
- plain_backslash_delimited_words := |*
1184
- c_space+ => extend_string_space;
1185
- c_eol => extend_string_eol;
1186
- c_any => extend_string;
1187
- *|;
1188
-
1189
- regexp_modifiers := |* # scanner that always emits one tREGEXP_OPT and then continues in expr_end
1190
- [A-Za-z]+ # a run of ASCII letters is taken as the option string
1191
- => {
1192
- unknown_options = tok.scan(/[^imxouesn]/) # every character other than i m x o u e s n
1193
- if unknown_options.any? # at least one unsupported letter: report them all in one diagnostic
1194
- diagnostic :error, :regexp_options,
1195
- { :options => unknown_options.join }
1196
- end
1197
-
1198
- emit(:tREGEXP_OPT)
1199
- fnext expr_end;
1200
- fbreak;
1201
- };
1202
-
1203
- any # fallback when the next character is not a letter
1204
- => {
1205
- emit(:tREGEXP_OPT, tok(@ts, @te - 1), @ts, @te - 1) # excludes the single matched character, so the token text is empty
1206
- fhold; # step back so that character is scanned again by expr_end
1207
- fgoto expr_end;
1208
- };
1209
- *|;
1210
-
1211
- #
1212
- # === WHITESPACE HANDLING ===
1213
- #
1214
-
1215
- # Various contexts in Ruby allow various kinds of whitespace
1216
- # to be used. They are grouped to clarify the lexing machines
1217
- # and ease collection of comments.
1218
-
1219
- # A line of code with inline #comment at end is always equivalent
1220
- # to a line of code ending with just a newline, so an inline
1221
- # comment is deemed equivalent to non-newline whitespace
1222
- # (c_space character class).
1223
-
1224
- w_space =
1225
- c_space+
1226
- | '\\' e_heredoc_nl
1227
- ;
1228
-
1229
- w_comment =
1230
- '#' %{ @sharp_s = p - 1 }
1231
- # The (p == pe) condition compensates for added "\0" and
1232
- # the way Ragel handles EOF.
1233
- c_line* %{ emit_comment(@sharp_s, p == pe ? p - 2 : p) }
1234
- ;
1235
-
1236
- w_space_comment =
1237
- w_space
1238
- | w_comment
1239
- ;
1240
-
1241
- # A newline in non-literal context always interoperates with
1242
- # here document logic and can always be escaped by a backslash,
1243
- # still interoperating with here document logic in the same way,
1244
- # yet being invisible to anything else.
1245
- #
1246
- # To demonstrate:
1247
- #
1248
- # foo = <<FOO \
1249
- # bar
1250
- # FOO
1251
- # + 2
1252
- #
1253
- # is equivalent to `foo = "bar\n" + 2`.
1254
-
1255
- w_newline =
1256
- e_heredoc_nl;
1257
-
1258
- w_any =
1259
- w_space
1260
- | w_comment
1261
- | w_newline
1262
- ;
1263
-
1264
-
1265
- #
1266
- # === EXPRESSION PARSING ===
1267
- #
1268
-
1269
- # These rules implement a form of manually defined lookahead.
1270
- # The default longest-match scanning does not work here due
1271
- # to sheer ambiguity.
1272
-
1273
- ambiguous_fid_suffix = # actual parsed
1274
- [?!] %{ tm = p } | # a? a?
1275
- [?!]'=' %{ tm = p - 2 } # a!=b a != b
1276
- ;
1277
-
1278
- ambiguous_ident_suffix = # actual parsed
1279
- ambiguous_fid_suffix |
1280
- '=' %{ tm = p } | # a= a=
1281
- '==' %{ tm = p - 2 } | # a==b a == b
1282
- '=~' %{ tm = p - 2 } | # a=~b a =~ b
1283
- '=>' %{ tm = p - 2 } | # a=>b a => b
1284
- '===' %{ tm = p - 3 } # a===b a === b
1285
- ;
1286
-
1287
- ambiguous_symbol_suffix = # actual parsed
1288
- ambiguous_ident_suffix |
1289
- '==>' %{ tm = p - 2 } # :a==>b :a= => b
1290
- ;
1291
-
1292
- # Ambiguous with 1.9 hash labels.
1293
- ambiguous_const_suffix = # actual parsed
1294
- '::' %{ tm = p - 2 } # A::B A :: B
1295
- ;
1296
-
1297
- # Resolving kDO/kDO_COND/kDO_BLOCK ambiguity requires embedding
1298
- # @cond/@cmdarg-related code to e_lbrack, e_lparen and e_lbrace.
1299
-
1300
- e_lbrack = '[' % {
1301
- @cond.push(false); @cmdarg.push(false)
1302
-
1303
- @paren_nest += 1
1304
- };
1305
-
1306
- e_rbrack = ']' % {
1307
- @paren_nest -= 1
1308
- };
1309
-
1310
- # Ruby 1.9 lambdas require parentheses counting in order to
1311
- # emit correct opening kDO/tLBRACE.
1312
-
1313
- e_lparen = '(' % {
1314
- @cond.push(false); @cmdarg.push(false)
1315
-
1316
- @paren_nest += 1
1317
-
1318
- if version?(18)
1319
- @command_start = true
1320
- end
1321
- };
1322
-
1323
- e_rparen = ')' % {
1324
- @paren_nest -= 1
1325
- };
1326
-
1327
- # Ruby is context-sensitive with respect to local identifiers: after emitting tIDENTIFIER, the next state depends on whether @static_env declares the name.
1328
- action local_ident {
1329
- emit(:tIDENTIFIER)
1330
-
1331
- if !@static_env.nil? && @static_env.declared?(tok) # a static env is present and it declares this name
1332
- fnext expr_endfn; fbreak;
1333
- else # no static env, or the name is not declared: arg_or_cmdarg picks the next state from cmd_state
1334
- fnext *arg_or_cmdarg(cmd_state); fbreak;
1335
- end
1336
- }
1337
-
1338
- # Variable lexing code is accessed from both expressions and
1339
- # string interpolation related code.
1340
- #
1341
- expr_variable := |*
1342
- global_var
1343
- => {
1344
- if tok =~ /^\$([1-9][0-9]*)$/
1345
- emit(:tNTH_REF, tok(@ts + 1).to_i)
1346
- elsif tok =~ /^\$([&`'+])$/
1347
- emit(:tBACK_REF)
1348
- else
1349
- emit(:tGVAR)
1350
- end
1351
-
1352
- fnext *stack_pop; fbreak;
1353
- };
1354
-
1355
- class_var_v
1356
- => {
1357
- if tok =~ /^@@[0-9]/
1358
- diagnostic :error, :cvar_name, { :name => tok }
1359
- end
1360
-
1361
- emit(:tCVAR)
1362
- fnext *stack_pop; fbreak;
1363
- };
1364
-
1365
- instance_var_v
1366
- => {
1367
- if tok =~ /^@[0-9]/
1368
- diagnostic :error, :ivar_name, { :name => tok }
1369
- end
1370
-
1371
- emit(:tIVAR)
1372
- fnext *stack_pop; fbreak;
1373
- };
1374
- *|;
1375
-
1376
- # Literal function name in definition (e.g. `def class`).
1377
- # Keywords are returned as their respective tokens; this is used
1378
- # to support singleton def `def self.foo`. Global variables are
1379
- # returned as `tGVAR`; this is used in global variable alias
1380
- # statements `alias $a $b`. Symbols are returned verbatim; this
1381
- # is used in `alias :a :"b#{foo}"` and `undef :a`.
1382
- #
1383
- # Transitions to `expr_endfn` afterwards.
1384
- #
1385
- expr_fname := |*
1386
- keyword
1387
- => { emit_table(KEYWORDS_BEGIN);
1388
- fnext expr_endfn; fbreak; };
1389
-
1390
- constant
1391
- => { emit(:tCONSTANT)
1392
- fnext expr_endfn; fbreak; };
1393
-
1394
- bareword [?=!]?
1395
- => { emit(:tIDENTIFIER)
1396
- fnext expr_endfn; fbreak; };
1397
-
1398
- global_var
1399
- => { p = @ts - 1
1400
- fnext expr_end; fcall expr_variable; };
1401
-
1402
- # If the handling was to be delegated to expr_end,
1403
- # these cases would transition to something else than
1404
- # expr_endfn, which is incorrect.
1405
- operator_fname |
1406
- operator_arithmetic |
1407
- operator_rest
1408
- => { emit_table(PUNCTUATION)
1409
- fnext expr_endfn; fbreak; };
1410
-
1411
- '::'
1412
- => { fhold; fhold; fgoto expr_end; };
1413
-
1414
- ':'
1415
- => { fhold; fgoto expr_beg; };
1416
-
1417
- '%s' (c_ascii - [A-Za-z0-9])
1418
- => {
1419
- if version?(23)
1420
- type, delimiter = tok[0..-2], tok[-1].chr
1421
- fgoto *push_literal(type, delimiter, @ts);
1422
- else
1423
- p = @ts - 1
1424
- fgoto expr_end;
1425
- end
1426
- };
1427
-
1428
- w_any;
1429
-
1430
- c_any
1431
- => { fhold; fgoto expr_end; };
1432
-
1433
- c_eof => do_eof;
1434
- *|;
1435
-
1436
- # After literal function name in definition. Behaves like `expr_end`,
1437
- # but allows a tLABEL.
1438
- #
1439
- # Transitions to `expr_end` afterwards.
1440
- #
1441
- expr_endfn := |*
1442
- label ( any - ':' )
1443
- => { emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
1444
- fhold; fnext expr_labelarg; fbreak; };
1445
-
1446
- '...'
1447
- => {
1448
- if @version >= 31 && @context.in_argdef
1449
- emit(:tBDOT3, '...'.freeze)
1450
- # emit(:tNL, "\n".freeze, @te - 1, @te)
1451
- fnext expr_end; fbreak;
1452
- else
1453
- p -= 3;
1454
- fgoto expr_end;
1455
- end
1456
- };
1457
-
1458
- w_space_comment;
1459
-
1460
- c_any
1461
- => { fhold; fgoto expr_end; };
1462
-
1463
- c_eof => do_eof;
1464
- *|;
1465
-
1466
- # Literal function name in method call (e.g. `a.class`).
1467
- #
1468
- # Transitions to `expr_arg` afterwards.
1469
- #
1470
- expr_dot := |*
1471
- constant
1472
- => { emit(:tCONSTANT)
1473
- fnext *arg_or_cmdarg(cmd_state); fbreak; };
1474
-
1475
- call_or_var
1476
- => { emit(:tIDENTIFIER)
1477
- fnext *arg_or_cmdarg(cmd_state); fbreak; };
1478
-
1479
- bareword ambiguous_fid_suffix
1480
- => { emit(:tFID, tok(@ts, tm), @ts, tm)
1481
- fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; };
1482
-
1483
- # See the comment in `expr_fname`.
1484
- operator_fname |
1485
- operator_arithmetic |
1486
- operator_rest
1487
- => { emit_table(PUNCTUATION)
1488
- fnext expr_arg; fbreak; };
1489
-
1490
- w_any;
1491
-
1492
- c_any
1493
- => { fhold; fgoto expr_end; };
1494
-
1495
- c_eof => do_eof;
1496
- *|;
1497
-
1498
- # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
1499
- # is consumed; the current expression is a command or method call.
1500
- #
1501
- expr_arg := |*
1502
- #
1503
- # COMMAND MODE SPECIFIC TOKENS
1504
- #
1505
-
1506
- # cmd (1 + 2)
1507
- # See below the rationale about expr_endarg.
1508
- w_space+ e_lparen
1509
- => {
1510
- if version?(18)
1511
- emit(:tLPAREN2, '('.freeze, @te - 1, @te)
1512
- fnext expr_value; fbreak;
1513
- else
1514
- emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te)
1515
- fnext expr_beg; fbreak;
1516
- end
1517
- };
1518
-
1519
- # meth(1 + 2)
1520
- # Regular method call.
1521
- e_lparen
1522
- => { emit(:tLPAREN2, '('.freeze)
1523
- fnext expr_beg; fbreak; };
1524
-
1525
- # meth [...]
1526
- # Array argument. Compare with indexing `meth[...]`.
1527
- w_space+ e_lbrack
1528
- => { emit(:tLBRACK, '['.freeze, @te - 1, @te)
1529
- fnext expr_beg; fbreak; };
1530
-
1531
- # cmd {}
1532
- # Command: method call without parentheses.
1533
- w_space* e_lbrace
1534
- => {
1535
- if @lambda_stack.last == @paren_nest
1536
- @lambda_stack.pop
1537
- emit(:tLAMBEG, '{'.freeze, @te - 1, @te)
1538
- else
1539
- emit(:tLCURLY, '{'.freeze, @te - 1, @te)
1540
- end
1541
- @command_start = true
1542
- @paren_nest += 1
1543
- fnext expr_value; fbreak;
1544
- };
1545
-
1546
- #
1547
- # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
1548
- #
1549
-
1550
- # a??
1551
- # Ternary operator
1552
- '?' c_space_nl
1553
- => {
1554
- # Unlike expr_beg as invoked in the next rule, do not warn
1555
- p = @ts - 1
1556
- fgoto expr_end;
1557
- };
1558
-
1559
- # a ?b, a? ?
1560
- # Character literal or ternary operator
1561
- w_space* '?'
1562
- => { fhold; fgoto expr_beg; };
1563
-
1564
- # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
1565
- # a /foo/ (but not "a / foo" or "a /=foo")
1566
- # a <<HEREDOC
1567
- w_space+ %{ tm = p }
1568
- ( [%/] ( c_any - c_space_nl - '=' ) # /
1569
- | '<<'
1570
- )
1571
- => {
1572
- if tok(tm, tm + 1) == '/'.freeze
1573
- # Ambiguous regexp literal.
1574
- if @version < 30
1575
- diagnostic :warning, :ambiguous_literal, nil, range(tm, tm + 1)
1576
- else
1577
- diagnostic :warning, :ambiguous_regexp, nil, range(tm, tm + 1)
1578
- end
1579
- end
1580
-
1581
- p = tm - 1
1582
- fgoto expr_beg;
1583
- };
1584
-
1585
- # x *1
1586
- # Ambiguous splat, kwsplat or block-pass.
1587
- w_space+ %{ tm = p } ( '+' | '-' | '*' | '&' | '**' )
1588
- => {
1589
- diagnostic :warning, :ambiguous_prefix, { :prefix => tok(tm, @te) },
1590
- range(tm, @te)
1591
-
1592
- p = tm - 1
1593
- fgoto expr_beg;
1594
- };
1595
-
1596
- # x ::Foo
1597
- # Ambiguous toplevel constant access.
1598
- w_space+ '::'
1599
- => { fhold; fhold; fgoto expr_beg; };
1600
-
1601
- # x:b
1602
- # Symbol.
1603
- w_space* ':'
1604
- => { fhold; fgoto expr_beg; };
1605
-
1606
- w_space+ label
1607
- => { p = @ts - 1; fgoto expr_beg; };
1608
-
1609
- #
1610
- # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
1611
- #
1612
-
1613
- # a ? b
1614
- # Ternary operator.
1615
- w_space+ %{ tm = p } '?' c_space_nl
1616
- => { p = tm - 1; fgoto expr_end; };
1617
-
1618
- # x + 1: Binary operator or operator-assignment.
1619
- w_space* operator_arithmetic
1620
- ( '=' | c_space_nl )? |
1621
- # x rescue y: Modifier keyword.
1622
- w_space* keyword_modifier |
1623
- # a &. b: Safe navigation operator.
1624
- w_space* '&.' |
1625
- # Miscellanea.
1626
- w_space* punctuation_end
1627
- => {
1628
- p = @ts - 1
1629
- fgoto expr_end;
1630
- };
1631
-
1632
- w_space;
1633
-
1634
- w_comment
1635
- => { fgoto expr_end; };
1636
-
1637
- w_newline
1638
- => { fhold; fgoto expr_end; };
1639
-
1640
- c_any
1641
- => { fhold; fgoto expr_beg; };
1642
-
1643
- c_eof => do_eof;
1644
- *|;
1645
-
1646
- # The previous token was an identifier which was seen while in the
1647
- # command mode (that is, the state at the beginning of #advance was
1648
- # expr_value). This state is very similar to expr_arg, but disambiguates
1649
- # two very rare and specific conditions:
1650
- # * In 1.8 mode, "foo (lambda do end)".
1651
- # * In 1.9+ mode, "f x: -> do foo do end end".
1652
- expr_cmdarg := |*
1653
- w_space+ e_lparen
1654
- => {
1655
- emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te)
1656
- if version?(18)
1657
- fnext expr_value; fbreak;
1658
- else
1659
- fnext expr_beg; fbreak;
1660
- end
1661
- };
1662
-
1663
- w_space* 'do'
1664
- => {
1665
- if @cond.active?
1666
- emit(:kDO_COND, 'do'.freeze, @te - 2, @te)
1667
- else
1668
- emit(:kDO, 'do'.freeze, @te - 2, @te)
1669
- end
1670
- fnext expr_value; fbreak;
1671
- };
1672
-
1673
- c_any |
1674
- # Disambiguate with the `do' rule above.
1675
- w_space* bareword |
1676
- w_space* label
1677
- => { p = @ts - 1
1678
- fgoto expr_arg; };
1679
-
1680
- c_eof => do_eof;
1681
- *|;
1682
-
1683
- # The rationale for this state is pretty complex. Normally, if an argument
1684
- # is passed to a command and then there is a block (tLCURLY...tRCURLY),
1685
- # the block is attached to the innermost argument (`f` in `m f {}`), or it
1686
- # is a parse error (`m 1 {}`). But there is a special case for passing a single
1687
- # primary expression grouped with parentheses: if you write `m (1) {}` or
1688
- # (2.0 only) `m () {}`, then the block is attached to `m`.
1689
- #
1690
- # Thus, we recognize the opening `(` of a command (remember, a command is
1691
- # a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
1692
- # `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
1693
- # lexer's state to `expr_endarg`, which makes it emit the possibly following
1694
- # `{` as `tLBRACE_ARG`.
1695
- #
1696
- # The default post-`expr_endarg` state is `expr_end`, so this state also handles
1697
- # `do` (as `kDO_BLOCK` in `expr_beg`).
1698
- expr_endarg := |*
1699
- e_lbrace
1700
- => {
1701
- if @lambda_stack.last == @paren_nest
1702
- @lambda_stack.pop
1703
- emit(:tLAMBEG, '{'.freeze)
1704
- else
1705
- emit(:tLBRACE_ARG, '{'.freeze)
1706
- end
1707
- @paren_nest += 1
1708
- @command_start = true
1709
- fnext expr_value; fbreak;
1710
- };
1711
-
1712
- 'do'
1713
- => { emit_do(true)
1714
- fnext expr_value; fbreak; };
1715
-
1716
- w_space_comment;
1717
-
1718
- c_any
1719
- => { fhold; fgoto expr_end; };
1720
-
1721
- c_eof => do_eof;
1722
- *|;
1723
-
1724
- # The rationale for this state is that several keywords accept value
1725
- # (i.e. should transition to `expr_beg`), do not accept it like a command
1726
- # (i.e. not an `expr_arg`), and must behave like a statement, that is,
1727
- # accept a modifier if/while/etc.
1728
- #
1729
- expr_mid := |*
1730
- keyword_modifier
1731
- => { emit_table(KEYWORDS)
1732
- fnext expr_beg; fbreak; };
1733
-
1734
- bareword
1735
- => { p = @ts - 1; fgoto expr_beg; };
1736
-
1737
- w_space_comment;
1738
-
1739
- w_newline
1740
- => { fhold; fgoto expr_end; };
1741
-
1742
- c_any
1743
- => { fhold; fgoto expr_beg; };
1744
-
1745
- c_eof => do_eof;
1746
- *|;
1747
-
1748
- # Beginning of an expression.
1749
- #
1750
- # Don't fall through to this state from `c_any`; make sure to handle
1751
- # `c_space* c_nl` and let `expr_end` handle the newline.
1752
- # Otherwise code like `f\ndef x` gets glued together and the parser
1753
- # explodes.
1754
- #
1755
- expr_beg := |*
1756
- # +5, -5, - 5
1757
- [+\-] w_any* [0-9]
1758
- => {
1759
- emit(:tUNARY_NUM, tok(@ts, @ts + 1), @ts, @ts + 1)
1760
- fhold; fnext expr_end; fbreak;
1761
- };
1762
-
1763
- # splat *a
1764
- '*'
1765
- => { emit(:tSTAR, '*'.freeze)
1766
- fbreak; };
1767
-
1768
- #
1769
- # STRING AND REGEXP LITERALS
1770
- #
1771
-
1772
- # /regexp/oui
1773
- # /=/ (disambiguation with /=)
1774
- '/' c_any
1775
- => {
1776
- type = delimiter = tok[0].chr
1777
- fhold; fgoto *push_literal(type, delimiter, @ts);
1778
- };
1779
-
1780
- # %<string>
1781
- '%' ( c_ascii - [A-Za-z0-9] )
1782
- => {
1783
- type, delimiter = @source_buffer.slice(@ts).chr, tok[-1].chr
1784
- fgoto *push_literal(type, delimiter, @ts);
1785
- };
1786
-
1787
- # %w(we are the people)
1788
- '%' [A-Za-z] (c_ascii - [A-Za-z0-9])
1789
- => {
1790
- type, delimiter = tok[0..-2], tok[-1].chr
1791
- fgoto *push_literal(type, delimiter, @ts);
1792
- };
1793
-
1794
- '%' c_eof
1795
- => {
1796
- diagnostic :fatal, :string_eof, nil, range(@ts, @ts + 1)
1797
- };
1798
-
1799
- # Heredoc start.
1800
- # <<END | <<'END' | <<"END" | <<`END` |
1801
- # <<-END | <<-'END' | <<-"END" | <<-`END` |
1802
- # <<~END | <<~'END' | <<~"END" | <<~`END`
1803
- '<<' [~\-]?
1804
- ( '"' ( any - '"' )* '"'
1805
- | "'" ( any - "'" )* "'"
1806
- | "`" ( any - "`" )* "`"
1807
- | bareword ) % { heredoc_e = p }
1808
- c_line* c_nl % { new_herebody_s = p }
1809
- => {
1810
- tok(@ts, heredoc_e) =~ /^<<(-?)(~?)(["'`]?)(.*)\3$/m
1811
-
1812
- indent = !$1.empty? || !$2.empty?
1813
- dedent_body = !$2.empty?
1814
- type = $3.empty? ? '<<"'.freeze : ('<<'.freeze + $3)
1815
- delimiter = $4
1816
-
1817
- if @version >= 27
1818
- if delimiter.count("\n") > 0 || delimiter.count("\r") > 0
1819
- diagnostic :error, :unterminated_heredoc_id, nil, range(@ts, @ts + 1)
1820
- end
1821
- elsif @version >= 24
1822
- if delimiter.count("\n") > 0
1823
- if delimiter.end_with?("\n")
1824
- diagnostic :warning, :heredoc_id_ends_with_nl, nil, range(@ts, @ts + 1)
1825
- delimiter = delimiter.rstrip
1826
- else
1827
- diagnostic :fatal, :heredoc_id_has_newline, nil, range(@ts, @ts + 1)
1828
- end
1829
- end
1830
- end
1831
-
1832
- if dedent_body && version?(18, 19, 20, 21, 22)
1833
- emit(:tLSHFT, '<<'.freeze, @ts, @ts + 2)
1834
- p = @ts + 1
1835
- fnext expr_beg; fbreak;
1836
- else
1837
- fnext *push_literal(type, delimiter, @ts, heredoc_e, indent, dedent_body);
1838
-
1839
- @herebody_s ||= new_herebody_s
1840
- p = @herebody_s - 1
1841
- end
1842
- };
1843
-
1844
-       # Escaped unterminated heredoc start
1845
-       # <<'END | <<"END | <<`END |
1846
-       # <<-'END | <<-"END | <<-`END |
1847
-       # <<~'END | <<~"END | <<~`END
1848
-       # Every alternative consumes the rest of the identifier up to (not including) the newline, so all three quote styles are scanned alike, an empty identifier included.
1849
-       # If the heredoc is terminated the rule above should handle it
1850
-       '<<' [~\-]?
1851
-         ('"' (any - c_nl - '"')*
1852
-         |"'" (any - c_nl - "'")*
1853
-         |"`" (any - c_nl - "`")*
1854
-         )
1855
-       => {
1856
-         diagnostic :error, :unterminated_heredoc_id, nil, range(@ts, @ts + 1)
1857
-       };
1858
-
1859
- #
1860
- # SYMBOL LITERALS
1861
- #
1862
-
1863
- # :&&, :||
1864
- ':' ('&&' | '||') => {
1865
- fhold; fhold;
1866
- emit(:tSYMBEG, tok(@ts, @ts + 1), @ts, @ts + 1)
1867
- fgoto expr_fname;
1868
- };
1869
-
1870
- # :"bar", :'baz'
1871
- ':' ['"] # '
1872
- => {
1873
- type, delimiter = tok, tok[-1].chr
1874
- fgoto *push_literal(type, delimiter, @ts);
1875
- };
1876
-
1877
- # :!@ is :!
1878
- # :~@ is :~
1879
- ':' [!~] '@'
1880
- => {
1881
- emit(:tSYMBOL, tok(@ts + 1, @ts + 2))
1882
- fnext expr_end; fbreak;
1883
- };
1884
-
1885
- ':' bareword ambiguous_symbol_suffix
1886
- => {
1887
- emit(:tSYMBOL, tok(@ts + 1, tm), @ts, tm)
1888
- p = tm - 1
1889
- fnext expr_end; fbreak;
1890
- };
1891
-
1892
- ':' ( bareword | global_var | class_var | instance_var |
1893
- operator_fname | operator_arithmetic | operator_rest )
1894
- => {
1895
- emit(:tSYMBOL, tok(@ts + 1), @ts)
1896
- fnext expr_end; fbreak;
1897
- };
1898
-
1899
- ':' ( '@' %{ tm = p - 1; diag_msg = :ivar_name }
1900
- | '@@' %{ tm = p - 2; diag_msg = :cvar_name }
1901
- ) [0-9]*
1902
- => {
1903
- if @version >= 27
1904
- diagnostic :error, diag_msg, { name: tok(tm, @te) }, range(tm, @te)
1905
- else
1906
- emit(:tCOLON, tok(@ts, @ts + 1), @ts, @ts + 1)
1907
- p = @ts
1908
- end
1909
-
1910
- fnext expr_end; fbreak;
1911
- };
1912
-
1913
- #
1914
- # AMBIGUOUS TERNARY OPERATOR
1915
- #
1916
-
1917
- # Character constant, like ?a, ?\n, ?\u1000, and so on
1918
- # Don't accept \u escape with multiple codepoints, like \u{1 2 3}
1919
- '?' ( e_bs ( escape - ( '\u{' (xdigit+ [ \t]+)+ xdigit+ '}' ))
1920
- | (c_any - c_space_nl - e_bs) % { @escape = nil }
1921
- )
1922
- => {
1923
- value = @escape || tok(@ts + 1)
1924
-
1925
- if version?(18)
1926
- emit(:tINTEGER, value.getbyte(0))
1927
- else
1928
- emit(:tCHARACTER, value)
1929
- end
1930
-
1931
- fnext expr_end; fbreak;
1932
- };
1933
-
1934
- '?' c_space_nl
1935
- => {
1936
- escape = { " " => '\s', "\r" => '\r', "\n" => '\n', "\t" => '\t',
1937
- "\v" => '\v', "\f" => '\f' }[@source_buffer.slice(@ts + 1)]
1938
- diagnostic :warning, :invalid_escape_use, { :escape => escape }, range
1939
-
1940
- p = @ts - 1
1941
- fgoto expr_end;
1942
- };
1943
-
1944
- '?' c_eof
1945
- => {
1946
- diagnostic :fatal, :incomplete_escape, nil, range(@ts, @ts + 1)
1947
- };
1948
-
1949
- # f ?aa : b: Disambiguate with a character literal.
1950
- '?' [A-Za-z_] bareword
1951
- => {
1952
- p = @ts - 1
1953
- fgoto expr_end;
1954
- };
1955
-
1956
- #
1957
- # AMBIGUOUS EMPTY BLOCK ARGUMENTS
1958
- #
1959
-
1960
- # Ruby >= 2.7 emits it as two tPIPE terminals
1961
- # while Ruby < 2.7 as a single tOROP (like in `a || b`)
1962
- '||'
1963
- => {
1964
- if @version >= 27
1965
- emit(:tPIPE, tok(@ts, @ts + 1), @ts, @ts + 1)
1966
- fhold;
1967
- fnext expr_beg; fbreak;
1968
- else
1969
- p -= 2
1970
- fgoto expr_end;
1971
- end
1972
- };
1973
-
1974
- #
1975
- # KEYWORDS AND PUNCTUATION
1976
- #
1977
-
1978
- # a({b=>c})
1979
- e_lbrace
1980
- => {
1981
- if @lambda_stack.last == @paren_nest
1982
- @lambda_stack.pop
1983
- @command_start = true
1984
- emit(:tLAMBEG, '{'.freeze)
1985
- else
1986
- emit(:tLBRACE, '{'.freeze)
1987
- end
1988
- @paren_nest += 1
1989
- fbreak;
1990
- };
1991
-
1992
- # a([1, 2])
1993
- e_lbrack
1994
- => { emit(:tLBRACK, '['.freeze)
1995
- fbreak; };
1996
-
1997
- # a()
1998
- e_lparen
1999
- => { emit(:tLPAREN, '('.freeze)
2000
- fbreak; };
2001
-
2002
- # a(+b)
2003
- punctuation_begin
2004
- => { emit_table(PUNCTUATION_BEGIN)
2005
- fbreak; };
2006
-
2007
- # rescue Exception => e: Block rescue.
2008
- # Special because it should transition to expr_mid.
2009
- 'rescue' %{ tm = p } '=>'?
2010
- => { emit(:kRESCUE, 'rescue'.freeze, @ts, tm)
2011
- p = tm - 1
2012
- fnext expr_mid; fbreak; };
2013
-
2014
- # if a: Statement if.
2015
- keyword_modifier
2016
- => { emit_table(KEYWORDS_BEGIN)
2017
- @command_start = true
2018
- fnext expr_value; fbreak; };
2019
-
2020
- #
2021
- # RUBY 1.9 HASH LABELS
2022
- #
2023
-
2024
- label ( any - ':' )
2025
- => {
2026
- fhold;
2027
-
2028
- if version?(18)
2029
- ident = tok(@ts, @te - 2)
2030
-
2031
- emit((@source_buffer.slice(@ts) =~ /[A-Z]/) ? :tCONSTANT : :tIDENTIFIER,
2032
- ident, @ts, @te - 2)
2033
- fhold; # continue as a symbol
2034
-
2035
- if !@static_env.nil? && @static_env.declared?(ident)
2036
- fnext expr_end;
2037
- else
2038
- fnext *arg_or_cmdarg(cmd_state);
2039
- end
2040
- else
2041
- emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
2042
- fnext expr_labelarg;
2043
- end
2044
-
2045
- fbreak;
2046
- };
2047
-
2048
- #
2049
- # RUBY 2.7 BEGINLESS RANGE
2050
-
2051
- '..'
2052
- => {
2053
- if @version >= 27
2054
- emit(:tBDOT2)
2055
- else
2056
- emit(:tDOT2)
2057
- end
2058
-
2059
- fnext expr_beg; fbreak;
2060
- };
2061
-
2062
-       '...' c_nl?
2063
-       => {
2064
-         # Here we scan and conditionally emit "\n":
2065
-         # + if it's there
2066
-         #   + and emitted we do nothing
2067
-         #   + and not emitted we return `p` to "\n" to process it on the next scan
2068
-         # + if it's not there we do nothing
2069
-         followed_by_nl = @te - 1 == @newline_s
2070
-         nl_emitted = false
2071
-         dots_te = followed_by_nl ? @te - 1 : @te
2072
-
2073
-         if @version >= 30
2074
-           if @lambda_stack.any? && @lambda_stack.last + 1 == @paren_nest
2075
-             # To reject `->(...)` like `->...`
2076
-             emit(:tDOT3, '...'.freeze, @ts, dots_te)
2077
-           else
2078
-             emit(:tBDOT3, '...'.freeze, @ts, dots_te)
2079
-
2080
-             if @version >= 31 && followed_by_nl && @context.in_argdef
2081
-               # emit takes (type, value, start, end). The value must be given
-               # explicitly, otherwise @te - 1 is taken as the token value and
-               # @te as its start. nil is used as for other value-less tokens.
-               emit(:tNL, nil, @te - 1, @te)
2082
-               nl_emitted = true
2083
-             end
2084
-           end
2085
-         elsif @version >= 27
2086
-           emit(:tBDOT3, '...'.freeze, @ts, dots_te)
2087
-         else
2088
-           emit(:tDOT3, '...'.freeze, @ts, dots_te)
2089
-         end
2090
-
2091
-         if followed_by_nl && !nl_emitted
2092
-           # return "\n" to process it on the next scan
2093
-           fhold;
2094
-         end
2095
-
2096
-         fnext expr_beg; fbreak;
2097
-       };
2098
-
2099
- #
2100
- # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
2101
- #
2102
-
2103
- # foo= bar: Disambiguate with bareword rule below.
2104
- bareword ambiguous_ident_suffix |
2105
- # def foo: Disambiguate with bareword rule below.
2106
- keyword
2107
- => { p = @ts - 1
2108
- fgoto expr_end; };
2109
-
2110
- # a = 42; a [42]: Indexing.
2111
- # def a; end; a [42]: Array argument.
2112
- call_or_var
2113
- => local_ident;
2114
-
2115
- (call_or_var - keyword)
2116
- % { ident_tok = tok; ident_ts = @ts; ident_te = @te; }
2117
- w_space+ '('
2118
- => {
2119
- emit(:tIDENTIFIER, ident_tok, ident_ts, ident_te)
2120
- p = ident_te - 1
2121
-
2122
- if !@static_env.nil? && @static_env.declared?(ident_tok) && @version < 25
2123
- fnext expr_endfn;
2124
- else
2125
- fnext expr_cmdarg;
2126
- end
2127
- fbreak;
2128
- };
2129
-
2130
- #
2131
- # WHITESPACE
2132
- #
2133
-
2134
- w_any;
2135
-
2136
- e_heredoc_nl '=begin' ( c_space | c_nl_zlen )
2137
- => {
2138
- p = @ts - 1
2139
- @cs_before_block_comment = @cs
2140
- fgoto line_begin;
2141
- };
2142
-
2143
- #
2144
- # DEFAULT TRANSITION
2145
- #
2146
-
2147
- # The following rules match most binary and all unary operators.
2148
- # Rules for binary operators provide better error reporting.
2149
- operator_arithmetic '=' |
2150
- operator_rest |
2151
- punctuation_end |
2152
- c_any
2153
- => { p = @ts - 1; fgoto expr_end; };
2154
-
2155
- c_eof => do_eof;
2156
- *|;
2157
-
2158
- # Special newline handling for "def a b:"
2159
- # A newline is put back and re-lexed in expr_end while @context.in_kwarg is set; otherwise lexing restarts at line_begin.
2160
- expr_labelarg := |*
2161
- w_space_comment; # no action attached to this pattern
2162
-
2163
- w_newline
2164
- => {
2165
- if @context.in_kwarg
2166
- fhold; fgoto expr_end;
2167
- else
2168
- fgoto line_begin;
2169
- end
2170
- };
2171
-
2172
- c_any
2173
- => { fhold; fgoto expr_beg; }; # put the character back and lex it in expr_beg
2174
-
2175
- c_eof => do_eof;
2176
- *|;
2177
-
2178
- # Like expr_beg, but no 1.9 label or 2.2 quoted label possible.
2179
- # A label followed by a non-colon character is rewound and re-lexed in expr_end; a quote starts a literal; anything else is handed to expr_beg.
2180
- expr_value := |*
2181
- # a:b: a(:b), a::B, A::B
2182
- label (any - ':')
2183
- => { p = @ts - 1 # rewind to just before the token start
2184
- fgoto expr_end; };
2185
-
2186
- # "bar", 'baz'
2187
- ['"] # '
2188
- => {
2189
- fgoto *push_literal(tok, tok, @ts); # the matched quote is passed as both of the first two arguments
2190
- };
2191
-
2192
- w_space_comment;
2193
-
2194
- w_newline
2195
- => { fgoto line_begin; };
2196
-
2197
- c_any
2198
- => { fhold; fgoto expr_beg; }; # put the character back and lex it in expr_beg
2199
-
2200
- c_eof => do_eof;
2201
- *|;
2202
-
2203
- expr_end := |* # Emits keywords, numeric literals, constants, method-call punctuation and operators; any other character is a fatal :unexpected diagnostic.
2204
- #
2205
- # STABBY LAMBDA
2206
- #
2207
-
2208
- '->'
2209
- => {
2210
- emit(:tLAMBDA, '->'.freeze, @ts, @ts + 2)
2211
-
2212
- @lambda_stack.push @paren_nest # remember the paren depth at which this -> was seen
2213
- fnext expr_endfn; fbreak;
2214
- };
2215
-
2216
- e_lbrace | 'do'
2217
- => {
2218
- if @lambda_stack.last == @paren_nest # paren depth matches the innermost pending ->: emit the lambda-specific token
2219
- @lambda_stack.pop
2220
-
2221
- if tok == '{'.freeze
2222
- emit(:tLAMBEG, '{'.freeze)
2223
- else # 'do'
2224
- emit(:kDO_LAMBDA, 'do'.freeze)
2225
- end
2226
- else
2227
- if tok == '{'.freeze
2228
- emit(:tLCURLY, '{'.freeze)
2229
- else # 'do'
2230
- emit_do
2231
- end
2232
- end
2233
- if tok == '{'.freeze # only a brace, not do, increases @paren_nest
2234
- @paren_nest += 1
2235
- end
2236
- @command_start = true
2237
-
2238
- fnext expr_value; fbreak;
2239
- };
2240
-
2241
- #
2242
- # KEYWORDS
2243
- #
2244
-
2245
- keyword_with_fname
2246
- => { emit_table(KEYWORDS)
2247
- fnext expr_fname; fbreak; };
2248
-
2249
- 'class' w_any* '<<'
2250
- => { emit(:kCLASS, 'class'.freeze, @ts, @ts + 5) # one match, two tokens: class spans the first five characters, << the last two
2251
- emit(:tLSHFT, '<<'.freeze, @te - 2, @te)
2252
- fnext expr_value; fbreak; };
2253
-
2254
- # a if b:c: Syntax error.
2255
- keyword_modifier
2256
- => { emit_table(KEYWORDS)
2257
- fnext expr_beg; fbreak; };
2258
-
2259
- # elsif b:c: elsif b(:c)
2260
- keyword_with_value
2261
- => { emit_table(KEYWORDS)
2262
- @command_start = true
2263
- fnext expr_value; fbreak; };
2264
-
2265
- keyword_with_mid
2266
- => { emit_table(KEYWORDS)
2267
- fnext expr_mid; fbreak; };
2268
-
2269
- keyword_with_arg
2270
- => {
2271
- emit_table(KEYWORDS)
2272
-
2273
- if version?(18) && tok == 'not'.freeze
2274
- fnext expr_beg; fbreak;
2275
- else
2276
- fnext expr_arg; fbreak;
2277
- end
2278
- };
2279
-
2280
- '__ENCODING__'
2281
- => {
2282
- if version?(18) # version 18 lexes __ENCODING__ as a plain identifier
2283
- emit(:tIDENTIFIER)
2284
-
2285
- unless !@static_env.nil? && @static_env.declared?(tok)
2286
- fnext *arg_or_cmdarg(cmd_state);
2287
- end
2288
- else
2289
- emit(:k__ENCODING__, '__ENCODING__'.freeze)
2290
- end
2291
- fbreak;
2292
- };
2293
-
2294
- keyword_with_end
2295
- => { emit_table(KEYWORDS)
2296
- fbreak; };
2297
-
2298
- #
2299
- # NUMERIC LITERALS
2300
- #
2301
-
2302
- ( '0' [Xx] %{ @num_base = 16; @num_digits_s = p } int_hex
2303
- | '0' [Dd] %{ @num_base = 10; @num_digits_s = p } int_dec
2304
- | '0' [Oo] %{ @num_base = 8; @num_digits_s = p } int_dec
2305
- | '0' [Bb] %{ @num_base = 2; @num_digits_s = p } int_bin
2306
- | [1-9] digit* '_'? %{ @num_base = 10; @num_digits_s = @ts } int_dec
2307
- | '0' digit* '_'? %{ @num_base = 8; @num_digits_s = @ts } int_dec
2308
- ) %{ @num_suffix_s = p } int_suffix
2309
- => {
2310
- digits = tok(@num_digits_s, @num_suffix_s) # text between the recorded digits start and suffix start
2311
-
2312
- if digits.end_with? '_'.freeze
2313
- diagnostic :error, :trailing_in_number, { :character => '_'.freeze },
2314
- range(@te - 1, @te)
2315
- elsif digits.empty? && @num_base == 8 && version?(18)
2316
- # 1.8 did not raise an error on 0o.
2317
- digits = '0'.freeze
2318
- elsif digits.empty?
2319
- diagnostic :error, :empty_numeric
2320
- elsif @num_base == 8 && (invalid_idx = digits.index(/[89]/))
2321
- invalid_s = @num_digits_s + invalid_idx
2322
- diagnostic :error, :invalid_octal, nil,
2323
- range(invalid_s, invalid_s + 1)
2324
- end
2325
-
2326
- if version?(18, 19, 20) # these versions emit the integer here and rewind to the suffix start; others hand the value to @num_xfrm
2327
- emit(:tINTEGER, digits.to_i(@num_base), @ts, @num_suffix_s)
2328
- p = @num_suffix_s - 1
2329
- else
2330
- @num_xfrm.call(digits.to_i(@num_base))
2331
- end
2332
- fbreak;
2333
- };
2334
-
2335
- flo_frac flo_pow?
2336
- => {
2337
- diagnostic :error, :no_dot_digit_literal
2338
- };
2339
-
2340
- flo_int [eE]
2341
- => {
2342
- if version?(18, 19, 20)
2343
- diagnostic :error,
2344
- :trailing_in_number, { :character => tok(@te - 1, @te) },
2345
- range(@te - 1, @te)
2346
- else
2347
- emit(:tINTEGER, tok(@ts, @te - 1).to_i, @ts, @te - 1)
2348
- fhold; fbreak; # the token excludes the trailing e or E, which is put back
2349
- end
2350
- };
2351
-
2352
- flo_int flo_frac [eE]
2353
- => {
2354
- if version?(18, 19, 20)
2355
- diagnostic :error,
2356
- :trailing_in_number, { :character => tok(@te - 1, @te) },
2357
- range(@te - 1, @te)
2358
- else
2359
- emit(:tFLOAT, tok(@ts, @te - 1).to_f, @ts, @te - 1)
2360
- fhold; fbreak; # the token excludes the trailing e or E, which is put back
2361
- end
2362
- };
2363
-
2364
- flo_int
2365
- ( flo_frac? flo_pow %{ @num_suffix_s = p } flo_pow_suffix
2366
- | flo_frac %{ @num_suffix_s = p } flo_suffix
2367
- )
2368
- => {
2369
- digits = tok(@ts, @num_suffix_s) # text from the token start up to the recorded suffix start
2370
-
2371
- if version?(18, 19, 20)
2372
- emit(:tFLOAT, Float(digits), @ts, @num_suffix_s)
2373
- p = @num_suffix_s - 1
2374
- else
2375
- @num_xfrm.call(digits)
2376
- end
2377
- fbreak;
2378
- };
2379
-
2380
- #
2381
- # STRING AND XSTRING LITERALS
2382
- #
2383
-
2384
- # `echo foo`, "bar", 'baz'
2385
- '`' | ['"] # '
2386
- => {
2387
- type, delimiter = tok, tok[-1].chr
2388
- fgoto *push_literal(type, delimiter, @ts, nil, false, false, true);
2389
- };
2390
-
2391
- #
2392
- # CONSTANTS AND VARIABLES
2393
- #
2394
-
2395
- constant
2396
- => { emit(:tCONSTANT)
2397
- fnext *arg_or_cmdarg(cmd_state); fbreak; };
2398
-
2399
- constant ambiguous_const_suffix
2400
- => { emit(:tCONSTANT, tok(@ts, tm), @ts, tm) # NOTE(review): tm is presumably set inside ambiguous_const_suffix (defined elsewhere) - confirm
2401
- p = tm - 1; fbreak; };
2402
-
2403
- global_var | class_var_v | instance_var_v
2404
- => { p = @ts - 1; fcall expr_variable; };
2405
-
2406
- #
2407
- # METHOD CALLS
2408
- #
2409
-
2410
- '.:' w_space+
2411
- => { emit(:tDOT, '.', @ts, @ts + 1)
2412
- emit(:tCOLON, ':', @ts + 1, @ts + 2)
2413
- p = p - tok.length + 2 # rewind p by the number of matched characters beyond the two-character .:
2414
- fnext expr_dot; fbreak; };
2415
-
2416
- '.:'
2417
- => {
2418
- if @version >= 27 # from version 27 on .: is one PUNCTUATION table token; earlier only the dot is emitted and the colon is put back
2419
- emit_table(PUNCTUATION)
2420
- else
2421
- emit(:tDOT, tok(@ts, @ts + 1), @ts, @ts + 1)
2422
- fhold;
2423
- end
2424
-
2425
- fnext expr_dot; fbreak;
2426
- };
2427
-
2428
- '.' | '&.' | '::'
2429
- => { emit_table(PUNCTUATION)
2430
- fnext expr_dot; fbreak; };
2431
-
2432
- call_or_var
2433
- => local_ident;
2434
-
2435
- bareword ambiguous_fid_suffix
2436
- => {
2437
- if tm == @te
2438
- # Suffix was consumed, e.g. foo!
2439
- emit(:tFID)
2440
- else
2441
- # Suffix was not consumed, e.g. foo!=
2442
- emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
2443
- p = tm - 1
2444
- end
2445
- fnext expr_arg; fbreak;
2446
- };
2447
-
2448
- #
2449
- # OPERATORS
2450
- #
2451
-
2452
- '*' | '=>'
2453
- => {
2454
- emit_table(PUNCTUATION)
2455
- fnext expr_value; fbreak;
2456
- };
2457
-
2458
- # When '|', '~', '!', '=>' are used as operators
2459
- # they do not accept any symbols (or quoted labels) after.
2460
- # Other binary operators accept it. NOTE(review): the rule below subtracts '*' rather than '=>' (both are matched by the rule above) - confirm this comment is still accurate.
2461
- ( operator_arithmetic | operator_rest ) - ( '|' | '~' | '!' | '*' )
2462
- => {
2463
- emit_table(PUNCTUATION);
2464
- fnext expr_value; fbreak;
2465
- };
2466
-
2467
- ( e_lparen | '|' | '~' | '!' )
2468
- => { emit_table(PUNCTUATION)
2469
- fnext expr_beg; fbreak; };
2470
-
2471
- e_rbrace | e_rparen | e_rbrack
2472
- => {
2473
- emit_table(PUNCTUATION)
2474
-
2475
- if @version < 24
2476
- @cond.lexpop
2477
- @cmdarg.lexpop
2478
- else
2479
- @cond.pop
2480
- @cmdarg.pop
2481
- end
2482
-
2483
- if tok == '}'.freeze || tok == ']'.freeze
2484
- if @version >= 25
2485
- fnext expr_end;
2486
- else
2487
- fnext expr_endarg;
2488
- end
2489
- else # )
2490
- # fnext expr_endfn; ?
2491
- end
2492
-
2493
- fbreak;
2494
- };
2495
-
2496
- operator_arithmetic '='
2497
- => { emit(:tOP_ASGN, tok(@ts, @te - 1)) # the token value drops the trailing =
2498
- fnext expr_beg; fbreak; };
2499
-
2500
- '?'
2501
- => { emit(:tEH, '?'.freeze)
2502
- fnext expr_value; fbreak; };
2503
-
2504
- e_lbrack
2505
- => { emit(:tLBRACK2, '['.freeze)
2506
- fnext expr_beg; fbreak; };
2507
-
2508
- '...' c_nl
2509
- => {
2510
- if @paren_nest == 0
2511
- diagnostic :warning, :triple_dot_at_eol, nil, range(@ts, @te - 1)
2512
- end
2513
-
2514
- emit(:tDOT3, '...'.freeze, @ts, @te - 1) # the token range stops before the newline
2515
- fhold; # put the newline back for the next scan
2516
- fnext expr_beg; fbreak;
2517
- };
2518
-
2519
- punctuation_end
2520
- => { emit_table(PUNCTUATION)
2521
- fnext expr_beg; fbreak; };
2522
-
2523
- #
2524
- # WHITESPACE
2525
- #
2526
-
2527
- w_space_comment;
2528
-
2529
- w_newline
2530
- => { fgoto leading_dot; }; # leading_dot decides whether tNL is emitted or a leading dot continues the expression
2531
-
2532
- ';'
2533
- => { emit(:tSEMI, ';'.freeze)
2534
- @command_start = true
2535
- fnext expr_value; fbreak; };
2536
-
2537
- '\\' c_line {
2538
- diagnostic :error, :bare_backslash, nil, range(@ts, @ts + 1)
2539
- fhold;
2540
- };
2541
-
2542
- c_any
2543
- => {
2544
- diagnostic :fatal, :unexpected, { :character => tok.inspect[1..-2] } # inspect wraps the text in quotes; [1..-2] strips them again
2545
- };
2546
-
2547
- c_eof => do_eof;
2548
- *|;
2549
-
2550
- leading_dot := |* # Entered from expr_end after a newline: either emits the pending tNL or, for a leading . or &., resumes expr_end without it.
2551
- # Insane leading dots:
2552
- # a #comment
2553
- # # post-2.7 comment
2554
- # .b: a.b
2555
-
2556
- # Here we use '\n' instead of w_newline to not modify @newline_s
2557
- # and eventually properly emit tNL
2558
- (c_space* w_space_comment '\n')+
2559
- => {
2560
- if @version < 27 # from version 27 on this rule emits nothing
2561
- # Ruby before 2.7 doesn't support comments before leading dot.
2562
- # If a line after "a" starts with a comment then "a" is a self-contained statement.
2563
- # So in that case we emit a special tNL token and start reading the
2564
- # next line as a separate statement.
2565
- #
2566
- # Note: block comments before leading dot are not supported on any version of Ruby.
2567
- emit(:tNL, nil, @newline_s, @newline_s + 1)
2568
- fhold; fnext line_begin; fbreak;
2569
- end
2570
- };
2571
-
2572
- c_space* '..'
2573
- => {
2574
- emit(:tNL, nil, @newline_s, @newline_s + 1) # the pending newline token is emitted in every version
2575
- if @version < 27
2576
- fhold; fnext line_begin; fbreak;
2577
- else
2578
- emit(:tBDOT2)
2579
- fnext expr_beg; fbreak;
2580
- end
2581
- };
2582
-
2583
- c_space* '...'
2584
- => {
2585
- emit(:tNL, nil, @newline_s, @newline_s + 1) # the pending newline token is emitted in every version
2586
- if @version < 27
2587
- fhold; fnext line_begin; fbreak;
2588
- else
2589
- emit(:tBDOT3)
2590
- fnext expr_beg; fbreak;
2591
- end
2592
- };
2593
-
2594
- c_space* %{ tm = p } ('.' | '&.')
2595
- => { p = tm - 1; fgoto expr_end; }; # rewind to the position saved in tm (before the dot) and continue in expr_end; no tNL is emitted
2596
-
2597
- any
2598
- => { emit(:tNL, nil, @newline_s, @newline_s + 1) # anything else: emit the pending newline token, put the character back and restart at line_begin
2599
- fhold; fnext line_begin; fbreak; };
2600
- *|;
2601
-
2602
- #
2603
- # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
2604
- #
2605
-
2606
- line_comment := |* # Lexes the body of a block comment; entered from the =begin rule of line_begin.
2607
- '=end' c_line* c_nl_zlen
2608
- => {
2609
- emit_comment(@eq_begin_s, @te) # the comment runs from the recorded =begin position to the end of this match
2610
- fgoto *@cs_before_block_comment; # resume in the state stored in @cs_before_block_comment
2611
- };
2612
-
2613
- c_line* c_nl; # other lines: no action attached
2614
-
2615
- c_line* zlen # NOTE(review): presumably matches only when input ends before =end - confirm; the diagnostic range covers the =begin marker
2616
- => {
2617
- diagnostic :fatal, :embedded_document, nil,
2618
- range(@eq_begin_s, @eq_begin_s + '=begin'.length)
2619
- };
2620
- *|;
2621
-
2622
- line_begin := |* # Handles =begin and __END__; any other character is handed to expr_value with cmd_state set.
2623
- w_any;
2624
-
2625
- '=begin' ( c_space | c_nl_zlen )
2626
- => { @eq_begin_s = @ts # remember where =begin starts; line_comment uses it for the comment range and its diagnostic
2627
- fgoto line_comment; }; # NOTE(review): @cs_before_block_comment is not assigned on this path - presumably initialised elsewhere, confirm
2628
-
2629
- '__END__' ( c_eol - zlen )
2630
- => { p = pe - 3 }; # NOTE(review): moves p close to pe, presumably to stop lexing the data after __END__ - confirm the offset
2631
-
2632
- c_any
2633
- => { cmd_state = true; fhold; fgoto expr_value; }; # put the character back and lex it in expr_value
2634
-
2635
- c_eof => do_eof;
2636
- *|;
2637
-
2638
- }%%
2639
- # %
2640
- end
2641
- end