parser 2.5.1.0 → 3.0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. checksums.yaml +4 -4
  2. data/lib/parser.rb +4 -0
  3. data/lib/parser/all.rb +3 -0
  4. data/lib/parser/ast/processor.rb +49 -1
  5. data/lib/parser/base.rb +30 -6
  6. data/lib/parser/builders/default.rb +586 -29
  7. data/lib/parser/context.rb +17 -0
  8. data/lib/parser/current.rb +34 -7
  9. data/lib/parser/current_arg_stack.rb +46 -0
  10. data/lib/parser/diagnostic.rb +1 -1
  11. data/lib/parser/diagnostic/engine.rb +1 -2
  12. data/lib/parser/lexer.rb +23780 -0
  13. data/lib/parser/lexer/dedenter.rb +52 -49
  14. data/lib/parser/lexer/literal.rb +4 -0
  15. data/lib/parser/lexer/stack_state.rb +4 -0
  16. data/lib/parser/macruby.rb +6149 -0
  17. data/lib/parser/max_numparam_stack.rb +56 -0
  18. data/lib/parser/messages.rb +74 -44
  19. data/lib/parser/meta.rb +13 -3
  20. data/lib/parser/ruby18.rb +5667 -0
  21. data/lib/parser/ruby19.rb +6092 -0
  22. data/lib/parser/ruby20.rb +6527 -0
  23. data/lib/parser/ruby21.rb +6578 -0
  24. data/lib/parser/ruby22.rb +6613 -0
  25. data/lib/parser/ruby23.rb +6624 -0
  26. data/lib/parser/ruby24.rb +6694 -0
  27. data/lib/parser/ruby25.rb +6662 -0
  28. data/lib/parser/ruby26.rb +6676 -0
  29. data/lib/parser/ruby27.rb +7862 -0
  30. data/lib/parser/ruby28.rb +8047 -0
  31. data/lib/parser/ruby30.rb +8060 -0
  32. data/lib/parser/ruby31.rb +8075 -0
  33. data/lib/parser/rubymotion.rb +6086 -0
  34. data/lib/parser/runner.rb +36 -2
  35. data/lib/parser/runner/ruby_parse.rb +2 -2
  36. data/lib/parser/runner/ruby_rewrite.rb +2 -2
  37. data/lib/parser/source/buffer.rb +54 -29
  38. data/lib/parser/source/comment.rb +18 -5
  39. data/lib/parser/source/comment/associator.rb +34 -11
  40. data/lib/parser/source/map.rb +1 -1
  41. data/lib/parser/source/map/method_definition.rb +25 -0
  42. data/lib/parser/source/range.rb +20 -4
  43. data/lib/parser/source/tree_rewriter.rb +146 -16
  44. data/lib/parser/source/tree_rewriter/action.rb +137 -28
  45. data/lib/parser/static_environment.rb +14 -0
  46. data/lib/parser/tree_rewriter.rb +3 -3
  47. data/lib/parser/variables_stack.rb +36 -0
  48. data/lib/parser/version.rb +1 -1
  49. data/parser.gemspec +13 -21
  50. metadata +34 -98
  51. data/.gitignore +0 -32
  52. data/.travis.yml +0 -21
  53. data/.yardopts +0 -21
  54. data/CHANGELOG.md +0 -909
  55. data/CONTRIBUTING.md +0 -17
  56. data/Gemfile +0 -10
  57. data/README.md +0 -301
  58. data/Rakefile +0 -165
  59. data/doc/AST_FORMAT.md +0 -1718
  60. data/doc/CUSTOMIZATION.md +0 -37
  61. data/doc/INTERNALS.md +0 -21
  62. data/doc/css/.gitkeep +0 -0
  63. data/doc/css/common.css +0 -68
  64. data/lib/parser/lexer.rl +0 -2376
  65. data/lib/parser/macruby.y +0 -2198
  66. data/lib/parser/ruby18.y +0 -1934
  67. data/lib/parser/ruby19.y +0 -2175
  68. data/lib/parser/ruby20.y +0 -2353
  69. data/lib/parser/ruby21.y +0 -2357
  70. data/lib/parser/ruby22.y +0 -2364
  71. data/lib/parser/ruby23.y +0 -2370
  72. data/lib/parser/ruby24.y +0 -2395
  73. data/lib/parser/ruby25.y +0 -2392
  74. data/lib/parser/ruby26.y +0 -2392
  75. data/lib/parser/rubymotion.y +0 -2182
  76. data/test/bug_163/fixtures/input.rb +0 -5
  77. data/test/bug_163/fixtures/output.rb +0 -5
  78. data/test/bug_163/rewriter.rb +0 -20
  79. data/test/helper.rb +0 -52
  80. data/test/parse_helper.rb +0 -315
  81. data/test/racc_coverage_helper.rb +0 -133
  82. data/test/test_base.rb +0 -31
  83. data/test/test_current.rb +0 -27
  84. data/test/test_diagnostic.rb +0 -96
  85. data/test/test_diagnostic_engine.rb +0 -62
  86. data/test/test_encoding.rb +0 -99
  87. data/test/test_lexer.rb +0 -3537
  88. data/test/test_lexer_stack_state.rb +0 -78
  89. data/test/test_parse_helper.rb +0 -80
  90. data/test/test_parser.rb +0 -6968
  91. data/test/test_runner_rewrite.rb +0 -47
  92. data/test/test_source_buffer.rb +0 -162
  93. data/test/test_source_comment.rb +0 -36
  94. data/test/test_source_comment_associator.rb +0 -367
  95. data/test/test_source_map.rb +0 -15
  96. data/test/test_source_range.rb +0 -172
  97. data/test/test_source_rewriter.rb +0 -541
  98. data/test/test_source_rewriter_action.rb +0 -46
  99. data/test/test_source_tree_rewriter.rb +0 -173
  100. data/test/test_static_environment.rb +0 -45
  101. data/test/using_tree_rewriter/fixtures/input.rb +0 -3
  102. data/test/using_tree_rewriter/fixtures/output.rb +0 -3
  103. data/test/using_tree_rewriter/using_tree_rewriter.rb +0 -9
data/doc/CUSTOMIZATION.md DELETED
@@ -1,37 +0,0 @@
1
- # Customizing Parsers
2
-
3
- While the default setup of the parsers provided by this Gem should be suitable
4
- for most, some developers might want to change parts of it. An example would be
5
- the use of a custom class for nodes instead of `Parser::AST::Node`.
6
-
7
- Customizing the AST is done by creating a custom builder class and passing it
8
- to the constructor method of a parser. The default setup comes down to the
9
- following:
10
-
11
- builder = Parser::Builders::Default.new
12
- parser = Parser::Ruby19.new(builder)
13
-
14
- When creating your own builder class it's best to subclass the default one so
15
- that you don't have to redefine every used method again:
16
-
17
- class MyBuilder < Parser::Builders::Default
18
-
19
- end
20
-
21
- builder = MyBuilder.new
22
- parser = Parser::Ruby19.new(builder)
23
-
24
- ## Custom Node Classes
25
-
26
- To use a custom node class you have to override the method
27
- `Parser::Builders::Default#n`:
28
-
29
- class MyBuilder < Parser::Builders::Default
30
- def n(type, children, location)
31
- return MyNodeClass.new(type, children, :location => location)
32
- end
33
- end
34
-
35
- Note that the used class (and corresponding instance) must be compatible with
36
- `Parser::AST::Node` so it's best to subclass it and override/add code where
37
- needed.
data/doc/INTERNALS.md DELETED
@@ -1,21 +0,0 @@
1
- Entry points
2
- ------------
3
-
4
- Parser should be kept as slim as possible. This includes not loading
5
- any potentially large files when they are likely to be unused in practice.
6
-
7
- Parser has five main (classes of) `require` entry points:
8
-
9
- * `require 'parser'`. Main entry point, requires all classes which
10
- are used across the entire library.
11
- * `require 'parser/rubyXX'`. Version-specific entry point. Can raise
12
- a NotImplementedError if current Ruby runtime is unable to parse the
13
- requested Ruby version.
14
- * `require 'parser/all'`. Requires all available parsers for released
15
- versions of Ruby. Can raise NotImplementedError.
16
- * `require 'parser/runner'`. Requires all the stuff which is useful for
17
- command-line tools but not otherwise.
18
- * `require 'parser/runner/X'`. Runner-specific entry point.
19
-
20
- All non-main entry points internally `require 'parser'`. Additionally, all
21
- runner-specific entry points internally `require 'parser/runner'`.
data/doc/css/.gitkeep DELETED
File without changes
data/doc/css/common.css DELETED
@@ -1,68 +0,0 @@
1
- body
2
- {
3
- font-size: 14px;
4
- line-height: 1.6;
5
- margin: 0 auto;
6
- max-width: 960px;
7
- }
8
-
9
- p code
10
- {
11
- background: #f2f2f2;
12
- padding-left: 3px;
13
- padding-right: 3px;
14
- }
15
-
16
- pre.code
17
- {
18
- font-size: 13px;
19
- line-height: 1.4;
20
- }
21
-
22
- /**
23
- * YARD uses generic table styles, using a special class means those tables
24
- * don't get messed up.
25
- */
26
- .table
27
- {
28
- border: 1px solid #ccc;
29
- border-right: none;
30
- border-collapse: separate;
31
- border-spacing: 0;
32
- text-align: left;
33
- }
34
-
35
- .table.full
36
- {
37
- width: 100%;
38
- }
39
-
40
- .table .field_name
41
- {
42
- min-width: 160px;
43
- }
44
-
45
- .table thead tr th.no_sort:first-child
46
- {
47
- width: 25px;
48
- }
49
-
50
- .table thead tr th, .table tbody tr td
51
- {
52
- border-bottom: 1px solid #ccc;
53
- border-right: 1px solid #ccc;
54
- min-width: 20px;
55
- padding: 8px 5px;
56
- text-align: left;
57
- vertical-align: top;
58
- }
59
-
60
- .table tbody tr:last-child td
61
- {
62
- border-bottom: none;
63
- }
64
-
65
- .table tr:nth-child(odd) td
66
- {
67
- background: #f9f9f9;
68
- }
data/lib/parser/lexer.rl DELETED
@@ -1,2376 +0,0 @@
1
- %%machine lex; # % fix highlighting
2
-
3
- #
4
- # === BEFORE YOU START ===
5
- #
6
- # Read the Ruby Hacking Guide chapter 11, available in English at
7
- # http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/
8
- #
9
- # Remember two things about Ragel scanners:
10
- #
11
- # 1) Longest match wins.
12
- #
13
- # 2) If two matches have the same length, the first
14
- # in source code wins.
15
- #
16
- # General rules of making Ragel and Bison happy:
17
- #
18
- # * `p` (position) and `@te` contain the index of the character
19
- # they're pointing to ("current"), plus one. `@ts` contains the index
20
- # of the corresponding character. The code for extracting matched token is:
21
- #
22
- # @source_buffer.slice(@ts...@te)
23
- #
24
- # * If your input is `foooooooobar` and the rule is:
25
- #
26
- # 'f' 'o'+
27
- #
28
- # the result will be:
29
- #
30
- # foooooooobar
31
- # ^ ts=0 ^ p=te=9
32
- #
33
- # * A Ragel lexer action should not emit more than one token, unless
34
- # you know what you are doing.
35
- #
36
- # * All Ragel commands (fnext, fgoto, ...) end with a semicolon.
37
- #
38
- # * If an action emits the token and transitions to another state, use
39
- # these Ragel commands:
40
- #
41
- # emit($whatever)
42
- # fnext $next_state; fbreak;
43
- #
44
- # If you perform `fgoto` in an action which does not emit a token nor
45
- # rewinds the stream pointer, the parser's side-effectful,
46
- # context-sensitive lookahead actions will break in a hard to detect
47
- # and debug way.
48
- #
49
- # * If an action does not emit a token:
50
- #
51
- # fgoto $next_state;
52
- #
53
- # * If an action features lookbehind, i.e. matches characters with the
54
- # intent of passing them to another action:
55
- #
56
- # p = @ts - 1
57
- # fgoto $next_state;
58
- #
59
- # or, if the lookbehind consists of a single character:
60
- #
61
- # fhold; fgoto $next_state;
62
- #
63
- # * Ragel merges actions. So, if you have `e_lparen = '(' %act` and
64
- # `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
65
- # _will_ invoke the action `act`.
66
- #
67
- # e_something stands for "something with **e**mbedded action".
68
- #
69
- # * EOF is explicit and is matched by `c_eof`. If you want to introspect
70
- # the state of the lexer, add this rule to the state:
71
- #
72
- # c_eof => do_eof;
73
- #
74
- # * If you proceed past EOF, the lexer will complain:
75
- #
76
- # NoMethodError: undefined method `ord' for nil:NilClass
77
- #
78
-
79
- class Parser::Lexer
80
-
81
- %% write data nofinal;
82
- # %
83
-
84
- ESCAPES = {
85
- ?a.ord => "\a", ?b.ord => "\b", ?e.ord => "\e", ?f.ord => "\f",
86
- ?n.ord => "\n", ?r.ord => "\r", ?s.ord => "\s", ?t.ord => "\t",
87
- ?v.ord => "\v", ?\\.ord => "\\"
88
- }.freeze
89
-
90
- REGEXP_META_CHARACTERS = Regexp.union(*"\\$()*+.<>?[]^{|}".chars).freeze
91
-
92
- attr_reader :source_buffer
93
-
94
- attr_accessor :diagnostics
95
- attr_accessor :static_env
96
- attr_accessor :force_utf32
97
-
98
- attr_accessor :cond, :cmdarg, :in_kwarg
99
-
100
- attr_accessor :tokens, :comments
101
-
102
- def initialize(version)
103
- @version = version
104
- @static_env = nil
105
-
106
- @tokens = nil
107
- @comments = nil
108
-
109
- reset
110
- end
111
-
112
- def reset(reset_state=true)
113
- # Ragel state:
114
- if reset_state
115
- # Unit tests set state prior to resetting lexer.
116
- @cs = self.class.lex_en_line_begin
117
-
118
- @cond = StackState.new('cond')
119
- @cmdarg = StackState.new('cmdarg')
120
- @cond_stack = []
121
- @cmdarg_stack = []
122
- end
123
-
124
- @force_utf32 = false # Set to true by some tests
125
-
126
- @source_pts = nil # @source as a codepoint array
127
-
128
- @p = 0 # stream position (saved manually in #advance)
129
- @ts = nil # token start
130
- @te = nil # token end
131
- @act = 0 # next action
132
-
133
- @stack = [] # state stack
134
- @top = 0 # state stack top pointer
135
-
136
- # Lexer state:
137
- @token_queue = []
138
- @literal_stack = []
139
-
140
- @eq_begin_s = nil # location of last encountered =begin
141
- @sharp_s = nil # location of last encountered #
142
-
143
- @newline_s = nil # location of last encountered newline
144
-
145
- @num_base = nil # last numeric base
146
- @num_digits_s = nil # starting position of numeric digits
147
- @num_suffix_s = nil # starting position of numeric suffix
148
- @num_xfrm = nil # numeric suffix-induced transformation
149
-
150
- @escape_s = nil # starting position of current sequence
151
- @escape = nil # last escaped sequence, as string
152
-
153
- @herebody_s = nil # starting position of current heredoc line
154
-
155
- # Ruby 1.9 ->() lambdas emit a distinct token if do/{ is
156
- # encountered after a matching closing parenthesis.
157
- @paren_nest = 0
158
- @lambda_stack = []
159
-
160
- # After encountering the closing line of <<~SQUIGGLY_HEREDOC,
161
- # we store the indentation level and give it out to the parser
162
- # on request. It is not possible to infer indentation level just
163
- # from the AST because escape sequences such as `\ ` or `\t` are
164
- # expanded inside the lexer, but count as non-whitespace for
165
- # indentation purposes.
166
- @dedent_level = nil
167
-
168
- # If the lexer is in `command state' (aka expr_value)
169
- # at the entry to #advance, it will transition to expr_cmdarg
170
- # instead of expr_arg at certain points.
171
- @command_state = false
172
-
173
- # True at the end of "def foo a:"
174
- @in_kwarg = false
175
-
176
- # State before =begin / =end block comment
177
- @cs_before_block_comment = self.class.lex_en_line_begin
178
- end
179
-
180
- def source_buffer=(source_buffer)
181
- @source_buffer = source_buffer
182
-
183
- if @source_buffer
184
- source = @source_buffer.source
185
-
186
- if source.encoding == Encoding::UTF_8
187
- @source_pts = source.unpack('U*')
188
- else
189
- @source_pts = source.unpack('C*')
190
- end
191
-
192
- if @source_pts[0] == 0xfeff
193
- # Skip byte order mark.
194
- @p = 1
195
- end
196
- else
197
- @source_pts = nil
198
- end
199
- end
200
-
201
- def encoding
202
- @source_buffer.source.encoding
203
- end
204
-
205
- LEX_STATES = {
206
- :line_begin => lex_en_line_begin,
207
- :expr_dot => lex_en_expr_dot,
208
- :expr_fname => lex_en_expr_fname,
209
- :expr_value => lex_en_expr_value,
210
- :expr_beg => lex_en_expr_beg,
211
- :expr_mid => lex_en_expr_mid,
212
- :expr_arg => lex_en_expr_arg,
213
- :expr_cmdarg => lex_en_expr_cmdarg,
214
- :expr_end => lex_en_expr_end,
215
- :expr_endarg => lex_en_expr_endarg,
216
- :expr_endfn => lex_en_expr_endfn,
217
- :expr_labelarg => lex_en_expr_labelarg,
218
-
219
- :interp_string => lex_en_interp_string,
220
- :interp_words => lex_en_interp_words,
221
- :plain_string => lex_en_plain_string,
222
- :plain_words => lex_en_plain_string,
223
- }
224
-
225
- def state
226
- LEX_STATES.invert.fetch(@cs, @cs)
227
- end
228
-
229
- def state=(state)
230
- @cs = LEX_STATES.fetch(state)
231
- end
232
-
233
- def push_cmdarg
234
- @cmdarg_stack.push(@cmdarg)
235
- @cmdarg = StackState.new("cmdarg.#{@cmdarg_stack.count}")
236
- end
237
-
238
- def pop_cmdarg
239
- @cmdarg = @cmdarg_stack.pop
240
- end
241
-
242
- def push_cond
243
- @cond_stack.push(@cond)
244
- @cond = StackState.new("cond.#{@cond_stack.count}")
245
- end
246
-
247
- def pop_cond
248
- @cond = @cond_stack.pop
249
- end
250
-
251
- def dedent_level
252
- # We erase @dedent_level as a precaution to avoid accidentally
253
- # using a stale value.
254
- dedent_level, @dedent_level = @dedent_level, nil
255
- dedent_level
256
- end
257
-
258
- # Return next token: [type, value].
259
- def advance
260
- if @token_queue.any?
261
- return @token_queue.shift
262
- end
263
-
264
- # Ugly, but dependent on Ragel output. Consider refactoring it somehow.
265
- klass = self.class
266
- _lex_trans_keys = klass.send :_lex_trans_keys
267
- _lex_key_spans = klass.send :_lex_key_spans
268
- _lex_index_offsets = klass.send :_lex_index_offsets
269
- _lex_indicies = klass.send :_lex_indicies
270
- _lex_trans_targs = klass.send :_lex_trans_targs
271
- _lex_trans_actions = klass.send :_lex_trans_actions
272
- _lex_to_state_actions = klass.send :_lex_to_state_actions
273
- _lex_from_state_actions = klass.send :_lex_from_state_actions
274
- _lex_eof_trans = klass.send :_lex_eof_trans
275
-
276
- pe = @source_pts.size + 2
277
- p, eof = @p, pe
278
-
279
- @command_state = (@cs == klass.lex_en_expr_value ||
280
- @cs == klass.lex_en_line_begin)
281
-
282
- %% write exec;
283
- # %
284
-
285
- @p = p
286
-
287
- if @token_queue.any?
288
- @token_queue.shift
289
- elsif @cs == klass.lex_error
290
- [ false, [ '$error'.freeze, range(p - 1, p) ] ]
291
- else
292
- eof = @source_pts.size
293
- [ false, [ '$eof'.freeze, range(eof, eof) ] ]
294
- end
295
- end
296
-
297
- protected
298
-
299
- def eof_codepoint?(point)
300
- [0x04, 0x1a, 0x00].include? point
301
- end
302
-
303
- def version?(*versions)
304
- versions.include?(@version)
305
- end
306
-
307
- def stack_pop
308
- @top -= 1
309
- @stack[@top]
310
- end
311
-
312
- def encode_escape(ord)
313
- ord.chr.force_encoding(@source_buffer.source.encoding)
314
- end
315
-
316
- def tok(s = @ts, e = @te)
317
- @source_buffer.slice(s...e)
318
- end
319
-
320
- def range(s = @ts, e = @te)
321
- Parser::Source::Range.new(@source_buffer, s, e)
322
- end
323
-
324
- def emit(type, value = tok, s = @ts, e = @te)
325
- token = [ type, [ value, range(s, e) ] ]
326
-
327
- @token_queue.push(token)
328
-
329
- @tokens.push(token) if @tokens
330
-
331
- token
332
- end
333
-
334
- def emit_table(table, s = @ts, e = @te)
335
- value = tok(s, e)
336
-
337
- emit(table[value], value, s, e)
338
- end
339
-
340
- def emit_do(do_block=false)
341
- if @cond.active?
342
- emit(:kDO_COND, 'do'.freeze)
343
- elsif @cmdarg.active? || do_block
344
- emit(:kDO_BLOCK, 'do'.freeze)
345
- else
346
- emit(:kDO, 'do'.freeze)
347
- end
348
- end
349
-
350
- def arg_or_cmdarg
351
- if @command_state
352
- self.class.lex_en_expr_cmdarg
353
- else
354
- self.class.lex_en_expr_arg
355
- end
356
- end
357
-
358
- def emit_comment(s = @ts, e = @te)
359
- if @comments
360
- @comments.push(Parser::Source::Comment.new(range(s, e)))
361
- end
362
-
363
- if @tokens
364
- @tokens.push([ :tCOMMENT, [ tok(s, e), range(s, e) ] ])
365
- end
366
-
367
- nil
368
- end
369
-
370
- def diagnostic(type, reason, arguments=nil, location=range, highlights=[])
371
- @diagnostics.process(
372
- Parser::Diagnostic.new(type, reason, arguments, location, highlights))
373
- end
374
-
375
- #
376
- # === LITERAL STACK ===
377
- #
378
-
379
- def push_literal(*args)
380
- new_literal = Literal.new(self, *args)
381
- @literal_stack.push(new_literal)
382
- next_state_for_literal(new_literal)
383
- end
384
-
385
- def next_state_for_literal(literal)
386
- if literal.words? && literal.backslash_delimited?
387
- if literal.interpolate?
388
- self.class.lex_en_interp_backslash_delimited_words
389
- else
390
- self.class.lex_en_plain_backslash_delimited_words
391
- end
392
- elsif literal.words? && !literal.backslash_delimited?
393
- if literal.interpolate?
394
- self.class.lex_en_interp_words
395
- else
396
- self.class.lex_en_plain_words
397
- end
398
- elsif !literal.words? && literal.backslash_delimited?
399
- if literal.interpolate?
400
- self.class.lex_en_interp_backslash_delimited
401
- else
402
- self.class.lex_en_plain_backslash_delimited
403
- end
404
- else
405
- if literal.interpolate?
406
- self.class.lex_en_interp_string
407
- else
408
- self.class.lex_en_plain_string
409
- end
410
- end
411
- end
412
-
413
- def literal
414
- @literal_stack.last
415
- end
416
-
417
- def pop_literal
418
- old_literal = @literal_stack.pop
419
-
420
- @dedent_level = old_literal.dedent_level
421
-
422
- if old_literal.type == :tREGEXP_BEG
423
- # Fetch modifiers.
424
- self.class.lex_en_regexp_modifiers
425
- else
426
- self.class.lex_en_expr_end
427
- end
428
- end
429
-
430
- # Mapping of strings to parser tokens.
431
-
432
- PUNCTUATION = {
433
- '=' => :tEQL, '&' => :tAMPER2, '|' => :tPIPE,
434
- '!' => :tBANG, '^' => :tCARET, '+' => :tPLUS,
435
- '-' => :tMINUS, '*' => :tSTAR2, '/' => :tDIVIDE,
436
- '%' => :tPERCENT, '~' => :tTILDE, ',' => :tCOMMA,
437
- ';' => :tSEMI, '.' => :tDOT, '..' => :tDOT2,
438
- '...' => :tDOT3, '[' => :tLBRACK2, ']' => :tRBRACK,
439
- '(' => :tLPAREN2, ')' => :tRPAREN, '?' => :tEH,
440
- ':' => :tCOLON, '&&' => :tANDOP, '||' => :tOROP,
441
- '-@' => :tUMINUS, '+@' => :tUPLUS, '~@' => :tTILDE,
442
- '**' => :tPOW, '->' => :tLAMBDA, '=~' => :tMATCH,
443
- '!~' => :tNMATCH, '==' => :tEQ, '!=' => :tNEQ,
444
- '>' => :tGT, '>>' => :tRSHFT, '>=' => :tGEQ,
445
- '<' => :tLT, '<<' => :tLSHFT, '<=' => :tLEQ,
446
- '=>' => :tASSOC, '::' => :tCOLON2, '===' => :tEQQ,
447
- '<=>' => :tCMP, '[]' => :tAREF, '[]=' => :tASET,
448
- '{' => :tLCURLY, '}' => :tRCURLY, '`' => :tBACK_REF2,
449
- '!@' => :tBANG, '&.' => :tANDDOT,
450
- }
451
-
452
- PUNCTUATION_BEGIN = {
453
- '&' => :tAMPER, '*' => :tSTAR, '**' => :tDSTAR,
454
- '+' => :tUPLUS, '-' => :tUMINUS, '::' => :tCOLON3,
455
- '(' => :tLPAREN, '{' => :tLBRACE, '[' => :tLBRACK,
456
- }
457
-
458
- KEYWORDS = {
459
- 'if' => :kIF_MOD, 'unless' => :kUNLESS_MOD,
460
- 'while' => :kWHILE_MOD, 'until' => :kUNTIL_MOD,
461
- 'rescue' => :kRESCUE_MOD, 'defined?' => :kDEFINED,
462
- 'BEGIN' => :klBEGIN, 'END' => :klEND,
463
- }
464
-
465
- KEYWORDS_BEGIN = {
466
- 'if' => :kIF, 'unless' => :kUNLESS,
467
- 'while' => :kWHILE, 'until' => :kUNTIL,
468
- 'rescue' => :kRESCUE, 'defined?' => :kDEFINED,
469
- 'BEGIN' => :klBEGIN, 'END' => :klEND,
470
- }
471
-
472
- %w(class module def undef begin end then elsif else ensure case when
473
- for break next redo retry in do return yield super self nil true
474
- false and or not alias __FILE__ __LINE__ __ENCODING__).each do |keyword|
475
- KEYWORDS_BEGIN[keyword] = KEYWORDS[keyword] = :"k#{keyword.upcase}"
476
- end
477
-
478
- %%{
479
- # %
480
-
481
- access @;
482
- getkey (@source_pts[p] || 0);
483
-
484
- # === CHARACTER CLASSES ===
485
- #
486
- # Pay close attention to the differences between c_any and any.
487
- # c_any does not include EOF and so will cause incorrect behavior
488
- # for machine subtraction (any-except rules) and default transitions
489
- # for scanners.
490
-
491
- action do_nl {
492
- # Record position of a newline for precise location reporting on tNL
493
- # tokens.
494
- #
495
- # This action is embedded directly into c_nl, as it is idempotent and
496
- # there are no cases when we need to skip it.
497
- @newline_s = p
498
- }
499
-
500
- c_nl = '\n' $ do_nl;
501
- c_space = [ \t\r\f\v];
502
- c_space_nl = c_space | c_nl;
503
-
504
- c_eof = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
505
- c_eol = c_nl | c_eof;
506
- c_any = any - c_eof;
507
-
508
- c_nl_zlen = c_nl | zlen;
509
- c_line = any - c_nl_zlen;
510
-
511
- c_unicode = c_any - 0x00..0x7f;
512
- c_upper = [A-Z];
513
- c_lower = [a-z_] | c_unicode;
514
- c_alpha = c_lower | c_upper;
515
- c_alnum = c_alpha | [0-9];
516
-
517
- action do_eof {
518
- # Sit at EOF indefinitely. #advance would return $eof each time.
519
- # This allows feeding the lexer more data if needed; this is only used
520
- # in tests.
521
- #
522
- # Note that this action is not embedded into e_eof like e_heredoc_nl and e_bs
523
- # below. This is due to the fact that scanner state at EOF is observed
524
- # by tests, and encapsulating it in a rule would break the introspection.
525
- fhold; fbreak;
526
- }
527
-
528
- #
529
- # === TOKEN DEFINITIONS ===
530
- #
531
-
532
- # All operators are punctuation. There is more to punctuation
533
- # than just operators. Operators can be overridden by user;
534
- # punctuation can not.
535
-
536
- # A list of operators which are valid in the function name context, but
537
- # have different semantics in others.
538
- operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' | '!@' ;
539
-
540
- # A list of operators which can occur within an assignment shortcut (+ → +=).
541
- operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' |
542
- '*' | '/' | '**' | '~' | '<<' | '>>' | '%' ;
543
-
544
- # A list of all user-definable operators not covered by groups above.
545
- operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' |
546
- '<' | '<=' | '>' | '>=' | '<=>' | '=>' ;
547
-
548
- # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace,
549
- # as they are ambiguous with interpolation `#{}` and should be counted.
550
- # These braces are not present in punctuation lists.
551
-
552
- # A list of punctuation which has different meaning when used at the
553
- # beginning of expression.
554
- punctuation_begin = '-' | '+' | '::' | '(' | '[' |
555
- '*' | '**' | '&' ;
556
-
557
- # A list of all punctuation except punctuation_begin.
558
- punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' |
559
- '::' | '?' | ':' | '.' | '..' | '...' ;
560
-
561
- # A list of keywords which have different meaning at the beginning of expression.
562
- keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ;
563
-
564
- # A list of keywords which accept an argument-like expression, i.e. have the
565
- # same post-processing as method calls or commands. Example: `yield 1`,
566
- # `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function.
567
- keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ;
568
-
569
- # A list of keywords which accept a literal function name as an argument.
570
- keyword_with_fname = 'def' | 'undef' | 'alias' ;
571
-
572
- # A list of keywords which accept an expression after them.
573
- keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' |
574
- 'for' | 'in' | 'do' | 'when' | 'begin' | 'class' |
575
- 'and' | 'or' ;
576
-
577
- # A list of keywords which accept a value, and treat the keywords from
578
- # `keyword_modifier` list as modifiers.
579
- keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ;
580
-
581
- # A list of keywords which do not accept an expression after them.
582
- keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' |
583
- 'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' |
584
- '__LINE__' | '__ENCODING__';
585
-
586
- # All keywords.
587
- keyword = keyword_with_value | keyword_with_mid |
588
- keyword_with_end | keyword_with_arg |
589
- keyword_with_fname | keyword_modifier ;
590
-
591
- constant = c_upper c_alnum*;
592
- bareword = c_alpha c_alnum*;
593
-
594
- call_or_var = c_lower c_alnum*;
595
- class_var = '@@' bareword;
596
- instance_var = '@' bareword;
597
- global_var = '$'
598
- ( bareword | digit+
599
- | [`'+~*$&?!@/\\;,.=:<>"] # `
600
- | '-' c_alnum
601
- )
602
- ;
603
-
604
- # Ruby accepts (and fails on) variables with leading digit
605
- # in literal context, but not in unquoted symbol body.
606
- class_var_v = '@@' c_alnum+;
607
- instance_var_v = '@' c_alnum+;
608
-
609
- label = bareword [?!]? ':';
610
-
611
- #
612
- # === NUMERIC PARSING ===
613
- #
614
-
615
- int_hex = ( xdigit+ '_' )* xdigit* '_'? ;
616
- int_dec = ( digit+ '_' )* digit* '_'? ;
617
- int_bin = ( [01]+ '_' )* [01]* '_'? ;
618
-
619
- flo_int = [1-9] [0-9]* ( '_' digit+ )* | '0';
620
- flo_frac = '.' ( digit+ '_' )* digit+;
621
- flo_pow = [eE] [+\-]? ( digit+ '_' )* digit+;
622
-
623
- int_suffix =
624
- '' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars) } }
625
- | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } }
626
- | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, chars)) } }
627
- | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } }
628
- | 'if' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 2); p -= 2 } }
629
- | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 6); p -= 6 } };
630
-
631
- flo_pow_suffix =
632
- '' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars)) } }
633
- | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Float(chars))) } }
634
- | 'if' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 2); p -= 2 } };
635
-
636
- flo_suffix =
637
- flo_pow_suffix
638
- | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } }
639
- | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } }
640
- | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 6); p -= 6 } };
641
-
642
- #
643
- # === ESCAPE SEQUENCE PARSING ===
644
- #
645
-
646
- # Escape parsing code is a Ragel pattern, not a scanner, and therefore
647
- # it shouldn't directly raise errors or perform other actions with side effects.
648
- # In reality this would probably just mess up error reporting in pathological
649
- # cases, though.
650
-
651
- # The amount of code required to parse \M\C stuff correctly is ridiculous.
652
-
653
- escaped_nl = "\\" c_nl;
654
-
655
- action unicode_points {
656
- @escape = ""
657
-
658
- codepoints = tok(@escape_s + 2, p - 1)
659
- codepoint_s = @escape_s + 2
660
-
661
- if @version < 24
662
- if codepoints.start_with?(" ") || codepoints.start_with?("\t")
663
- diagnostic :fatal, :invalid_unicode_escape, nil,
664
- range(@escape_s + 2, @escape_s + 3)
665
- end
666
-
667
- if spaces_p = codepoints.index(/[ \t]{2}/)
668
- diagnostic :fatal, :invalid_unicode_escape, nil,
669
- range(codepoint_s + spaces_p + 1, codepoint_s + spaces_p + 2)
670
- end
671
-
672
- if codepoints.end_with?(" ") || codepoints.end_with?("\t")
673
- diagnostic :fatal, :invalid_unicode_escape, nil, range(p - 1, p)
674
- end
675
- end
676
-
677
- codepoints.scan(/([0-9a-fA-F]+)|([ \t]+)/).each do |(codepoint_str, spaces)|
678
- if spaces
679
- codepoint_s += spaces.length
680
- else
681
- codepoint = codepoint_str.to_i(16)
682
-
683
- if codepoint >= 0x110000
684
- diagnostic :error, :unicode_point_too_large, nil,
685
- range(codepoint_s, codepoint_s + codepoint_str.length)
686
- break
687
- end
688
-
689
- @escape += codepoint.chr(Encoding::UTF_8)
690
- codepoint_s += codepoint_str.length
691
- end
692
- end
693
- }
694
-
695
- action unescape_char {
696
- codepoint = @source_pts[p - 1]
697
- if (@escape = ESCAPES[codepoint]).nil?
698
- @escape = encode_escape(@source_buffer.slice(p - 1))
699
- end
700
- }
701
-
702
- action invalid_complex_escape {
703
- diagnostic :fatal, :invalid_escape
704
- }
705
-
706
- action slash_c_char {
707
- @escape = encode_escape(@escape[0].ord & 0x9f)
708
- }
709
-
710
- action slash_m_char {
711
- @escape = encode_escape(@escape[0].ord | 0x80)
712
- }
713
-
714
- maybe_escaped_char = (
715
- '\\' c_any %unescape_char
716
- | ( c_any - [\\] ) % { @escape = @source_buffer.slice(p - 1).chr }
717
- );
718
-
719
- maybe_escaped_ctrl_char = ( # why?!
720
- '\\' c_any %unescape_char %slash_c_char
721
- | '?' % { @escape = "\x7f" }
722
- | ( c_any - [\\?] ) % { @escape = @source_buffer.slice(p - 1).chr } %slash_c_char
723
- );
724
-
725
- escape = (
726
- # \377
727
- [0-7]{1,3}
728
- % { @escape = encode_escape(tok(@escape_s, p).to_i(8) % 0x100) }
729
-
730
- # \xff
731
- | 'x' xdigit{1,2}
732
- % { @escape = encode_escape(tok(@escape_s + 1, p).to_i(16)) }
733
-
734
- # %q[\x]
735
- | 'x' ( c_any - xdigit )
736
- % {
737
- diagnostic :fatal, :invalid_hex_escape, nil, range(@escape_s - 1, p + 2)
738
- }
739
-
740
- # \u263a
741
- | 'u' xdigit{4}
742
- % { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) }
743
-
744
- # \u123
745
- | 'u' xdigit{0,3}
746
- % {
747
- diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p)
748
- }
749
-
750
- # u{not hex} or u{}
751
- | 'u{' ( c_any - xdigit - [ \t}] )* '}'
752
- % {
753
- diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p)
754
- }
755
-
756
- # \u{ \t 123 \t 456 \t\t }
757
- | 'u{' [ \t]* ( xdigit{1,6} [ \t]+ )*
758
- (
759
- ( xdigit{1,6} [ \t]* '}'
760
- %unicode_points
761
- )
762
- |
763
- ( xdigit* ( c_any - xdigit - [ \t}] )+ '}'
764
- | ( c_any - [ \t}] )* c_eof
765
- | xdigit{7,}
766
- ) % {
767
- diagnostic :fatal, :unterminated_unicode, nil, range(p - 1, p)
768
- }
769
- )
770
-
771
- # \C-\a \cx
772
- | ( 'C-' | 'c' ) escaped_nl?
773
- maybe_escaped_ctrl_char
774
-
775
- # \M-a
776
- | 'M-' escaped_nl?
777
- maybe_escaped_char
778
- %slash_m_char
779
-
780
- # \C-\M-f \M-\cf \c\M-f
781
- | ( ( 'C-' | 'c' ) escaped_nl? '\\M-'
782
- | 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl?
783
- maybe_escaped_ctrl_char
784
- %slash_m_char
785
-
786
- | 'C' c_any %invalid_complex_escape
787
- | 'M' c_any %invalid_complex_escape
788
- | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape
789
-
790
- | ( c_any - [0-7xuCMc] ) %unescape_char
791
-
792
- | c_eof % {
793
- diagnostic :fatal, :escape_eof, nil, range(p - 1, p)
794
- }
795
- );
796
-
797
- # Use rules of the form `e_bs escape' when you need to parse an escape sequence.
798
- e_bs = '\\' % {
799
- @escape_s = p # position of the first character after the backslash
800
- @escape = nil # drop any value left over from a previous escape
801
- };
802
-
803
- #
804
- # === STRING AND HEREDOC PARSING ===
805
- #
806
-
807
- # Heredoc parsing is quite a complex topic. First, consider that heredocs
808
- # can be arbitrarily nested. For example:
809
- #
810
- # puts <<CODE
811
- # the result is: #{<<RESULT.inspect
812
- # i am a heredoc
813
- # RESULT
814
- # }
815
- # CODE
816
- #
817
- # which, incidentally, evaluates to:
818
- #
819
- # the result is: " i am a heredoc\n"
820
- #
821
- # To parse them, lexer refers to two kinds (remember, nested heredocs)
822
- # of positions in the input stream, namely heredoc_e
823
- # (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
824
- #
825
- # heredoc_e is simply contained inside the corresponding Literal, and
826
- # when the heredoc is closed, the lexing is restarted from that position.
827
- #
828
- # @herebody_s is considerably more complex. First, @herebody_s changes after each
829
- # heredoc line is lexed. This way, at '\n' tok(@herebody_s, @te) always
830
- # contains the current line, and also when a heredoc is started, @herebody_s
831
- # contains the position from which the heredoc will be lexed.
832
- #
833
- # Second, as (insanity) there are nested heredocs, we need to maintain a
834
- # stack of these positions. Each time #push_literal is called, it saves current
835
- # @herebody_s to literal.saved_herebody_s, and after an interpolation (possibly
836
- # containing other heredocs) is closed, the previous value is restored.
837
-
838
- e_heredoc_nl = c_nl % {
839
- # Once every pending heredoc has been parsed, @herebody_s holds the
840
- # position of the next token after all heredoc bodies.
841
- if @herebody_s
842
- p = @herebody_s # continue lexing from that position
843
- @herebody_s = nil # the saved position is used only once
844
- end
845
- };
846
-
847
- action extend_string { # Appends the current token to the innermost literal, or closes that literal.
848
- string = tok
849
-
850
- # tLABEL_END is only possible in non-cond context on >= 2.2
851
- if @version >= 22 && !@cond.active?
852
- lookahead = @source_buffer.slice(@te...@te+2) # the two characters right after the token
853
- end
854
-
855
- current_literal = literal
856
- if !current_literal.heredoc? && # closing is not attempted here for heredocs
857
- (token = current_literal.nest_and_try_closing(string, @ts, @te, lookahead))
858
- if token[0] == :tLABEL_END
859
- p += 1 # skip one more character; presumably the colon ending the label (confirm in Literal)
860
- pop_literal
861
- fnext expr_labelarg;
862
- else
863
- fnext *pop_literal; # the next state is whatever pop_literal returns
864
- end
865
- fbreak;
866
- else
867
- current_literal.extend_string(string, @ts, @te) # not closed: the token is ordinary content
868
- end
869
- }
870
-
871
- action extend_string_escaped { # Appends a backslash sequence to the innermost literal in the form that literal needs.
871
- current_literal = literal
872
- # Get the first character after the backslash.
873
- escaped_char = @source_buffer.slice(@escape_s).chr
874
-
875
- if current_literal.munge_escape? escaped_char
876
- # If this particular literal uses this character as an opening
877
- # or closing delimiter, it is an escape sequence for that
878
- # particular character. Write it without the backslash.
879
-
880
- if current_literal.regexp? && REGEXP_META_CHARACTERS.match(escaped_char)
881
- # Regular expressions should include escaped delimiters in their
882
- # escaped form, except when the escaped character is
883
- # a closing delimiter but not a regexp metacharacter.
884
- #
885
- # The backslash itself cannot be used as a closing delimiter
886
- # at the same time as an escape symbol, but it is always munged,
887
- # so this branch also executes for the non-closing-delimiter case
888
- # for the backslash.
889
- current_literal.extend_string(tok, @ts, @te)
890
- else
891
- current_literal.extend_string(escaped_char, @ts, @te)
892
- end
893
- else
894
- # The character is not munged, so this is an actual escape sequence.
895
- if current_literal.regexp?
896
- # Regular expressions should include escape sequences in their
897
- # escaped form. On the other hand, escaped newlines are removed.
898
- current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)
899
- elsif current_literal.heredoc? && escaped_char == "\n".freeze
900
- if current_literal.squiggly_heredoc?
901
- # Squiggly heredocs like
902
- # <<~-HERE
903
- # 1\
904
- # 2
905
- # HERE
906
- # treat '\' as a line continuation, but still dedent the body, so the heredoc above becomes "12\n".
907
- # The token is emitted as is, without unescaping;
908
- # the escape sequence (\\n) is handled later in Lexer::Dedenter.
909
- current_literal.extend_string(tok, @ts, @te)
910
- else
911
- # Plain heredocs also parse \\n as a line continuation,
912
- # but they don't need to know that there was originally a newline in the
913
- # code, so the backslash-newline is removed and the text is emitted as " 1 2\n"
914
- current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)
915
- end
916
- else
917
- current_literal.extend_string(@escape || tok, @ts, @te) # decoded value when one was set, raw token otherwise
918
- end
919
- end
920
- }
922
-
923
- # Extend a string with a newline or an EOF character.
924
- # As the closing line of a heredoc can immediately precede EOF, this action
925
- # has to handle that case specially.
926
- action extend_string_eol {
927
- current_literal = literal
928
- if @te == pe # the token ends where the input ends, so the literal was never closed
929
- diagnostic :fatal, :string_eof, nil,
930
- range(current_literal.str_s, current_literal.str_s + 1)
931
- end
932
-
933
- if current_literal.heredoc?
934
- line = tok(@herebody_s, @ts).gsub(/\r+$/, ''.freeze)
935
-
936
- if version?(18, 19, 20)
937
- # See ruby:c48b4209c
938
- line = line.gsub(/\r.*$/, ''.freeze)
939
- end
940
-
941
- # Try ending the heredoc with the complete most recently
942
- # scanned line. @herebody_s always refers to the start of that line.
943
- if current_literal.nest_and_try_closing(line, @herebody_s, @ts)
944
- # Adjust @herebody_s to point to the next line.
945
- @herebody_s = @te
946
-
947
- # Continue regular lexing after the heredoc reference (<<END).
948
- p = current_literal.heredoc_e - 1
949
- fnext *pop_literal; fbreak;
950
- else
951
- # Calculate indentation level for <<~HEREDOCs.
952
- current_literal.infer_indent_level(line)
953
-
954
- # Likewise, adjust @herebody_s to point to the next line.
955
- @herebody_s = @te
956
- end
957
- else
958
- # Try ending the literal with a newline.
959
- if current_literal.nest_and_try_closing(tok, @ts, @te)
960
- fnext *pop_literal; fbreak;
961
- end
962
-
963
- if @herebody_s
964
- # This is a regular literal intertwined with a heredoc. Like:
965
- #
966
- # p <<-foo+"1
967
- # bar
968
- # foo
969
- # 2"
970
- #
971
- # which, incidentally, evaluates to "bar\n1\n2".
972
- p = @herebody_s - 1
973
- @herebody_s = nil
974
- end
975
- end
976
-
977
- if current_literal.words? && !eof_codepoint?(@source_pts[p])
978
- current_literal.extend_space @ts, @te
979
- else
980
- # A literal newline is appended if the heredoc was _not_ closed
981
- # this time (see fbreak above). See also Literal#nest_and_try_closing
982
- # for rationale of calling #flush_string here.
983
- current_literal.extend_string tok, @ts, @te
984
- current_literal.flush_string
985
- end
986
- }
987
-
988
- action extend_string_space {
989
- literal.extend_space @ts, @te
990
- }
991
-
992
- #
993
- # === INTERPOLATION PARSING ===
994
- #
995
-
996
- # Interpolations with immediate variable names simply call into
997
- # the corresponding machine.
998
-
999
- interp_var = '#' ( global_var | class_var_v | instance_var_v );
1000
-
1001
- action extend_interp_var { # Handles an interp_var match inside a literal.
1002
- current_literal = literal
1003
- current_literal.flush_string
1004
- current_literal.extend_content
1005
-
1006
- emit(:tSTRING_DVAR, nil, @ts, @ts + 1) # the token spans only the first character of the match
1007
-
1008
- p = @ts # rewind so that expr_variable lexes the variable that follows
1009
- fcall expr_variable;
1010
- }
1011
-
1012
- # Interpolations with code blocks must match nested curly braces, as
1013
- # interpolation ending is ambiguous with a block ending. So, every
1014
- # opening and closing brace should be matched with e_[lr]brace rules,
1015
- # which automatically perform the counting.
1016
- #
1017
- # Note that interpolations can themselves be nested, so brace balance
1018
- # is tied to the innermost literal.
1019
- #
1020
- # Also note that literals themselves should not use e_[lr]brace rules
1021
- # when matching their opening and closing delimiters, as the amount of
1022
- # braces inside the characters of a string literal is independent.
1023
-
1024
- interp_code = '#{';
1025
-
1026
- e_lbrace = '{' % { # Runs when an opening brace is matched.
1027
- @cond.push(false); @cmdarg.push(false)
1028
-
1029
- current_literal = literal
1030
- if current_literal # only while a literal is being lexed
1031
- current_literal.start_interp_brace # let the innermost literal count this brace
1032
- end
1033
- };
1034
-
1035
- e_rbrace = '}' % { # Runs when a closing brace is matched; may end an interpolation of the innermost literal.
1036
- current_literal = literal
1037
- if current_literal # only while a literal is being lexed
1038
- if current_literal.end_interp_brace_and_try_closing # truthy result means the interpolation ends here
1039
- if version?(18, 19)
1040
- emit(:tRCURLY, '}'.freeze, p - 1, p)
1041
- if @version < 24 # NOTE(review): looks always true inside version?(18, 19), so the else arm seems dead; confirm
1042
- @cond.lexpop
1043
- @cmdarg.lexpop
1044
- else
1045
- @cond.pop
1046
- @cmdarg.pop
1047
- end
1048
- else
1049
- emit(:tSTRING_DEND, '}'.freeze, p - 1, p)
1050
- end
1051
-
1052
- if current_literal.saved_herebody_s
1053
- @herebody_s = current_literal.saved_herebody_s # restore the position stashed by extend_interp_code
1054
- end
1055
-
1056
-
1057
- fhold; # step p back by one
1058
- fnext *next_state_for_literal(current_literal); # resume in the state chosen for this literal
1059
- fbreak;
1060
- end
1061
- end
1062
- };
1063
-
1064
- action extend_interp_code { # Handles an interp_code match: starts a code interpolation inside the literal.
1065
- current_literal = literal
1066
- current_literal.flush_string
1067
- current_literal.extend_content
1068
-
1069
- emit(:tSTRING_DBEG, '#{'.freeze)
1070
-
1071
- if current_literal.heredoc?
1072
- current_literal.saved_herebody_s = @herebody_s # stash the body position; e_rbrace restores it
1073
- @herebody_s = nil # cleared while the interpolated code is lexed
1074
- end
1075
-
1076
- current_literal.start_interp_brace # count the opening brace of the interpolation
1077
- fnext expr_value;
1078
- fbreak;
1079
- }
1080
-
1081
- # Actual string parsers are simply combined from the primitives defined
1082
- # above.
1083
-
1084
- interp_words := |*
1085
- interp_code => extend_interp_code;
1086
- interp_var => extend_interp_var;
1087
- e_bs escape => extend_string_escaped;
1088
- c_space+ => extend_string_space;
1089
- c_eol => extend_string_eol;
1090
- c_any => extend_string;
1091
- *|;
1092
-
1093
- interp_string := |*
1094
- interp_code => extend_interp_code;
1095
- interp_var => extend_interp_var;
1096
- e_bs escape => extend_string_escaped;
1097
- c_eol => extend_string_eol;
1098
- c_any => extend_string;
1099
- *|;
1100
-
1101
- plain_words := |*
1102
- e_bs c_any => extend_string_escaped;
1103
- c_space+ => extend_string_space;
1104
- c_eol => extend_string_eol;
1105
- c_any => extend_string;
1106
- *|;
1107
-
1108
- plain_string := |*
1109
- '\\' c_nl => extend_string_eol;
1110
- e_bs c_any => extend_string_escaped;
1111
- c_eol => extend_string_eol;
1112
- c_any => extend_string;
1113
- *|;
1114
-
1115
- interp_backslash_delimited := |*
1116
- interp_code => extend_interp_code;
1117
- interp_var => extend_interp_var;
1118
- c_eol => extend_string_eol;
1119
- c_any => extend_string;
1120
- *|;
1121
-
1122
- plain_backslash_delimited := |*
1123
- c_eol => extend_string_eol;
1124
- c_any => extend_string;
1125
- *|;
1126
-
1127
- interp_backslash_delimited_words := |*
1128
- interp_code => extend_interp_code;
1129
- interp_var => extend_interp_var;
1130
- c_space+ => extend_string_space;
1131
- c_eol => extend_string_eol;
1132
- c_any => extend_string;
1133
- *|;
1134
-
1135
- plain_backslash_delimited_words := |*
1136
- c_space+ => extend_string_space;
1137
- c_eol => extend_string_eol;
1138
- c_any => extend_string;
1139
- *|;
1140
-
1141
- regexp_modifiers := |* # Emits a tREGEXP_OPT token and continues in expr_end.
1142
- [A-Za-z]+ # a run of letters is taken as the option list
1143
- => {
1144
- unknown_options = tok.scan(/[^imxouesn]/) # every letter other than i m x o u e s n
1145
- if unknown_options.any?
1146
- diagnostic :error, :regexp_options,
1147
- { :options => unknown_options.join }
1148
- end
1149
-
1150
- emit(:tREGEXP_OPT) # emitted even when unknown letters were reported
1151
- fnext expr_end;
1152
- fbreak;
1153
- };
1154
-
1155
- any # anything else: there are no option letters
1156
- => {
1157
- emit(:tREGEXP_OPT, tok(@ts, @te - 1), @ts, @te - 1) # value and range exclude the matched character
1158
- fhold; # give that character back
1159
- fgoto expr_end;
1160
- };
1161
- *|;
1162
-
1163
- #
1164
- # === WHITESPACE HANDLING ===
1165
- #
1166
-
1167
- # Various contexts in Ruby allow various kinds of whitespace
1168
- # to be used. They are grouped to clarify the lexing machines
1169
- # and ease collection of comments.
1170
-
1171
- # A line of code with inline #comment at end is always equivalent
1172
- # to a line of code ending with just a newline, so an inline
1173
- # comment is deemed equivalent to non-newline whitespace
1174
- # (c_space character class).
1175
-
1176
- w_space =
1177
- c_space+
1178
- | '\\' e_heredoc_nl
1179
- ;
1180
-
1181
- w_comment =
1182
- '#' %{ @sharp_s = p - 1 }
1183
- # The (p == pe) condition compensates for added "\0" and
1184
- # the way Ragel handles EOF.
1185
- c_line* %{ emit_comment(@sharp_s, p == pe ? p - 2 : p) }
1186
- ;
1187
-
1188
- w_space_comment =
1189
- w_space
1190
- | w_comment
1191
- ;
1192
-
1193
- # A newline in non-literal context always interoperates with
1194
- # here document logic and can always be escaped by a backslash,
1195
- # still interoperating with here document logic in the same way,
1196
- # yet being invisible to anything else.
1197
- #
1198
- # To demonstrate:
1199
- #
1200
- # foo = <<FOO \
1201
- # bar
1202
- # FOO
1203
- # + 2
1204
- #
1205
- # is equivalent to `foo = "bar\n" + 2`.
1206
-
1207
- w_newline =
1208
- e_heredoc_nl;
1209
-
1210
- w_any =
1211
- w_space
1212
- | w_comment
1213
- | w_newline
1214
- ;
1215
-
1216
-
1217
- #
1218
- # === EXPRESSION PARSING ===
1219
- #
1220
-
1221
- # These rules implement a form of manually defined lookahead.
1222
- # The default longest-match scanning does not work here due
1223
- # to sheer ambiguity.
1224
-
1225
- ambiguous_fid_suffix = # actual parsed
1226
- [?!] %{ tm = p } | # a? a?
1227
- [?!]'=' %{ tm = p - 2 } # a!=b a != b
1228
- ;
1229
-
1230
- ambiguous_ident_suffix = # actual parsed
1231
- ambiguous_fid_suffix |
1232
- '=' %{ tm = p } | # a= a=
1233
- '==' %{ tm = p - 2 } | # a==b a == b
1234
- '=~' %{ tm = p - 2 } | # a=~b a =~ b
1235
- '=>' %{ tm = p - 2 } | # a=>b a => b
1236
- '===' %{ tm = p - 3 } # a===b a === b
1237
- ;
1238
-
1239
- ambiguous_symbol_suffix = # actual parsed
1240
- ambiguous_ident_suffix |
1241
- '==>' %{ tm = p - 2 } # :a==>b :a= => b
1242
- ;
1243
-
1244
- # Ambiguous with 1.9 hash labels.
1245
- ambiguous_const_suffix = # actual parsed
1246
- '::' %{ tm = p - 2 } # A::B A :: B
1247
- ;
1248
-
1249
- # Resolving kDO/kDO_COND/kDO_BLOCK ambiguity requires embedding
1250
- # @cond/@cmdarg-related code to e_lbrack, e_lparen and e_lbrace.
1251
-
1252
- e_lbrack = '[' % {
1253
- @cond.push(false); @cmdarg.push(false)
1254
- };
1255
-
1256
- # Ruby 1.9 lambdas require parentheses counting in order to
1257
- # emit correct opening kDO/tLBRACE.
1258
-
1259
- e_lparen = '(' % {
1260
- @cond.push(false); @cmdarg.push(false)
1261
-
1262
- @paren_nest += 1
1263
- };
1264
-
1265
- e_rparen = ')' % {
1266
- @paren_nest -= 1
1267
- };
1268
-
1269
- # Ruby is context-sensitive with respect to local identifiers: the next state depends on whether the name is declared.
1270
- action local_ident {
1271
- emit(:tIDENTIFIER)
1272
-
1273
- if !@static_env.nil? && @static_env.declared?(tok)
1274
- fnext expr_endfn; fbreak; # the name is declared in @static_env
1275
- else
1276
- fnext *arg_or_cmdarg; fbreak; # not declared, or no @static_env: state comes from arg_or_cmdarg
1277
- end
1278
- }
1279
-
1280
- # Variable lexing code is used by both expression lexing and
1281
- # string interpolation code.
1282
- #
1283
- expr_variable := |*
1284
- global_var
1285
- => {
1286
- if tok =~ /^\$([1-9][0-9]*)$/
1287
- emit(:tNTH_REF, tok(@ts + 1).to_i) # value is the integer that follows the dollar sign
1288
- elsif tok =~ /^\$([&`'+])$/
1289
- emit(:tBACK_REF) # one of the four special globals listed in the pattern above
1290
- else
1291
- emit(:tGVAR) # any other global variable
1292
- end
1293
-
1294
- fnext *stack_pop; fbreak; # the next state is whatever stack_pop returns
1295
- };
1296
-
1297
- class_var_v
1298
- => {
1299
- if tok =~ /^@@[0-9]/ # the name begins with a digit
1300
- diagnostic :error, :cvar_name, { :name => tok }
1301
- end
1302
-
1303
- emit(:tCVAR) # emitted even after the error diagnostic
1304
- fnext *stack_pop; fbreak;
1305
- };
1306
-
1307
- instance_var_v
1308
- => {
1309
- if tok =~ /^@[0-9]/ # the name begins with a digit
1310
- diagnostic :error, :ivar_name, { :name => tok }
1311
- end
1312
-
1313
- emit(:tIVAR) # emitted even after the error diagnostic
1314
- fnext *stack_pop; fbreak;
1315
- };
1316
- *|;
1317
-
1318
- # Literal function name in definition (e.g. `def class`).
1319
- # Keywords are returned as their respective tokens; this is used
1320
- # to support singleton def `def self.foo`. Global variables are
1321
- # returned as `tGVAR`; this is used in global variable alias
1322
- # statements `alias $a $b`. Symbols are returned verbatim; this
1323
- # is used in `alias :a :"b#{foo}"` and `undef :a`.
1324
- #
1325
- # Transitions to `expr_endfn` afterwards.
1326
- #
1327
- expr_fname := |*
1328
- keyword
1329
- => { emit_table(KEYWORDS_BEGIN);
1330
- fnext expr_endfn; fbreak; };
1331
-
1332
- constant
1333
- => { emit(:tCONSTANT)
1334
- fnext expr_endfn; fbreak; };
1335
-
1336
- bareword [?=!]?
1337
- => { emit(:tIDENTIFIER)
1338
- fnext expr_endfn; fbreak; };
1339
-
1340
- global_var
1341
- => { p = @ts - 1
1342
- fnext expr_end; fcall expr_variable; };
1343
-
1344
- # If the handling was to be delegated to expr_end,
1345
- # these cases would transition to something else than
1346
- # expr_endfn, which is incorrect.
1347
- operator_fname |
1348
- operator_arithmetic |
1349
- operator_rest
1350
- => { emit_table(PUNCTUATION)
1351
- fnext expr_endfn; fbreak; };
1352
-
1353
- '::'
1354
- => { fhold; fhold; fgoto expr_end; };
1355
-
1356
- ':'
1357
- => { fhold; fgoto expr_beg; };
1358
-
1359
- '%s' c_any
1360
- => {
1361
- if version?(23)
1362
- type, delimiter = tok[0..-2], tok[-1].chr
1363
- fgoto *push_literal(type, delimiter, @ts);
1364
- else
1365
- p = @ts - 1
1366
- fgoto expr_end;
1367
- end
1368
- };
1369
-
1370
- w_any;
1371
-
1372
- c_any
1373
- => { fhold; fgoto expr_end; };
1374
-
1375
- c_eof => do_eof;
1376
- *|;
1377
-
1378
- # After literal function name in definition. Behaves like `expr_end`,
1379
- # but allows a tLABEL.
1380
- #
1381
- # Transitions to `expr_end` afterwards.
1382
- #
1383
- expr_endfn := |*
1384
- label ( any - ':' )
1385
- => { emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
1386
- fhold; fnext expr_labelarg; fbreak; };
1387
-
1388
- w_space_comment;
1389
-
1390
- c_any
1391
- => { fhold; fgoto expr_end; };
1392
-
1393
- c_eof => do_eof;
1394
- *|;
1395
-
1396
- # Literal function name in method call (e.g. `a.class`).
1397
- #
1398
- # Transitions to `expr_arg` afterwards.
1399
- #
1400
- expr_dot := |*
1401
- constant
1402
- => { emit(:tCONSTANT)
1403
- fnext *arg_or_cmdarg; fbreak; };
1404
-
1405
- call_or_var
1406
- => { emit(:tIDENTIFIER)
1407
- fnext *arg_or_cmdarg; fbreak; };
1408
-
1409
- bareword ambiguous_fid_suffix
1410
- => { emit(:tFID, tok(@ts, tm), @ts, tm)
1411
- fnext *arg_or_cmdarg; p = tm - 1; fbreak; };
1412
-
1413
- # See the comment in `expr_fname`.
1414
- operator_fname |
1415
- operator_arithmetic |
1416
- operator_rest
1417
- => { emit_table(PUNCTUATION)
1418
- fnext expr_arg; fbreak; };
1419
-
1420
- w_any;
1421
-
1422
- c_any
1423
- => { fhold; fgoto expr_end; };
1424
-
1425
- c_eof => do_eof;
1426
- *|;
1427
-
1428
- # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space
1429
- # is consumed; the current expression is a command or method call.
1430
- #
1431
- expr_arg := |*
1432
- #
1433
- # COMMAND MODE SPECIFIC TOKENS
1434
- #
1435
-
1436
- # cmd (1 + 2)
1437
- # See below the rationale about expr_endarg.
1438
- w_space+ e_lparen
1439
- => {
1440
- if version?(18)
1441
- emit(:tLPAREN2, '('.freeze, @te - 1, @te)
1442
- fnext expr_value; fbreak;
1443
- else
1444
- emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te)
1445
- fnext expr_beg; fbreak;
1446
- end
1447
- };
1448
-
1449
- # meth(1 + 2)
1450
- # Regular method call.
1451
- e_lparen
1452
- => { emit(:tLPAREN2, '('.freeze)
1453
- fnext expr_beg; fbreak; };
1454
-
1455
- # meth [...]
1456
- # Array argument. Compare with indexing `meth[...]`.
1457
- w_space+ e_lbrack
1458
- => { emit(:tLBRACK, '['.freeze, @te - 1, @te)
1459
- fnext expr_beg; fbreak; };
1460
-
1461
- # cmd {}
1462
- # Command: method call without parentheses.
1463
- w_space* e_lbrace
1464
- => {
1465
- if @lambda_stack.last == @paren_nest
1466
- @lambda_stack.pop
1467
- emit(:tLAMBEG, '{'.freeze, @te - 1, @te)
1468
- else
1469
- emit(:tLCURLY, '{'.freeze, @te - 1, @te)
1470
- end
1471
- fnext expr_value; fbreak;
1472
- };
1473
-
1474
- #
1475
- # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
1476
- #
1477
-
1478
- # a??
1479
- # Ternary operator
1480
- '?' c_space_nl
1481
- => {
1482
- # Unlike expr_beg as invoked in the next rule, do not warn
1483
- p = @ts - 1
1484
- fgoto expr_end;
1485
- };
1486
-
1487
- # a ?b, a? ?
1488
- # Character literal or ternary operator
1489
- w_space* '?'
1490
- => { fhold; fgoto expr_beg; };
1491
-
1492
- # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
1493
- # a /foo/ (but not "a / foo" or "a /=foo")
1494
- # a <<HEREDOC
1495
- w_space+ %{ tm = p }
1496
- ( [%/] ( c_any - c_space_nl - '=' ) # /
1497
- | '<<'
1498
- )
1499
- => {
1500
- if tok(tm, tm + 1) == '/'.freeze
1501
- # Ambiguous regexp literal.
1502
- diagnostic :warning, :ambiguous_literal, nil, range(tm, tm + 1)
1503
- end
1504
-
1505
- p = tm - 1
1506
- fgoto expr_beg;
1507
- };
1508
-
1509
- # x *1
1510
- # Ambiguous splat, kwsplat or block-pass.
1511
- w_space+ %{ tm = p } ( '+' | '-' | '*' | '&' | '**' )
1512
- => {
1513
- diagnostic :warning, :ambiguous_prefix, { :prefix => tok(tm, @te) },
1514
- range(tm, @te)
1515
-
1516
- p = tm - 1
1517
- fgoto expr_beg;
1518
- };
1519
-
1520
- # x ::Foo
1521
- # Ambiguous toplevel constant access.
1522
- w_space+ '::'
1523
- => { fhold; fhold; fgoto expr_beg; };
1524
-
1525
- # x:b
1526
- # Symbol.
1527
- w_space* ':'
1528
- => { fhold; fgoto expr_beg; };
1529
-
1530
- w_space+ label
1531
- => { p = @ts - 1; fgoto expr_beg; };
1532
-
1533
- #
1534
- # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
1535
- #
1536
-
1537
- # a ? b
1538
- # Ternary operator.
1539
- w_space+ %{ tm = p } '?' c_space_nl
1540
- => { p = tm - 1; fgoto expr_end; };
1541
-
1542
- # x + 1: Binary operator or operator-assignment.
1543
- w_space* operator_arithmetic
1544
- ( '=' | c_space_nl )? |
1545
- # x rescue y: Modifier keyword.
1546
- w_space* keyword_modifier |
1547
- # a &. b: Safe navigation operator.
1548
- w_space* '&.' |
1549
- # Miscellanea.
1550
- w_space* punctuation_end
1551
- => {
1552
- p = @ts - 1
1553
- fgoto expr_end;
1554
- };
1555
-
1556
- w_space;
1557
-
1558
- w_comment
1559
- => { fgoto expr_end; };
1560
-
1561
- w_newline
1562
- => { fhold; fgoto expr_end; };
1563
-
1564
- c_any
1565
- => { fhold; fgoto expr_beg; };
1566
-
1567
- c_eof => do_eof;
1568
- *|;
1569
-
1570
- # The previous token was an identifier that was seen while in
1571
- # command mode (that is, the state at the beginning of #advance was
1572
- # expr_value). This state is very similar to expr_arg, but disambiguates
1573
- # two very rare and specific conditions:
1574
- # * In 1.8 mode, "foo (lambda do end)".
1575
- # * In 1.9+ mode, "f x: -> do foo do end end".
1576
- expr_cmdarg := |*
1577
- w_space+ e_lparen
1578
- => {
1579
- emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te) # the token covers only the last character of the match
1580
- if version?(18)
1581
- fnext expr_value; fbreak;
1582
- else
1583
- fnext expr_beg; fbreak;
1584
- end
1585
- };
1586
-
1587
- w_space* 'do'
1588
- => {
1589
- if @cond.active?
1590
- emit(:kDO_COND, 'do'.freeze, @te - 2, @te) # range is the two characters of the keyword
1591
- else
1592
- emit(:kDO, 'do'.freeze, @te - 2, @te)
1593
- end
1594
- fnext expr_value; fbreak;
1595
- };
1596
-
1597
- c_any |
1598
- # Disambiguate with the `do' rule above.
1599
- w_space* bareword |
1600
- w_space* label
1601
- => { p = @ts - 1 # rewind so that expr_arg lexes the same input again
1602
- fgoto expr_arg; };
1603
-
1604
- c_eof => do_eof;
1605
- *|;
1606
-
1607
- # The rationale for this state is pretty complex. Normally, if an argument
1608
- # is passed to a command and then there is a block (tLCURLY...tRCURLY),
1609
- # the block is attached to the innermost argument (`f` in `m f {}`), or it
1610
- # is a parse error (`m 1 {}`). But there is a special case for passing a single
1611
- # primary expression grouped with parentheses: if you write `m (1) {}` or
1612
- # (2.0 only) `m () {}`, then the block is attached to `m`.
1613
- #
1614
- # Thus, we recognize the opening `(` of a command (remember, a command is
1615
- # a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize
1616
- # `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the
1617
- # lexer's state to `expr_endarg`, which makes it emit the possibly following
1618
- # `{` as `tLBRACE_ARG`.
1619
- #
1620
- # The default post-`expr_endarg` state is `expr_end`, so this state also handles
1621
- # `do` (as `kDO_BLOCK` in `expr_beg`).
1622
- expr_endarg := |*
1623
- e_lbrace
1624
- => {
1625
- if @lambda_stack.last == @paren_nest
1626
- @lambda_stack.pop
1627
- emit(:tLAMBEG, '{'.freeze)
1628
- else
1629
- emit(:tLBRACE_ARG, '{'.freeze)
1630
- end
1631
- fnext expr_value;
1632
- };
1633
-
1634
- 'do'
1635
- => { emit_do(true)
1636
- fnext expr_value; fbreak; };
1637
-
1638
- w_space_comment;
1639
-
1640
- c_any
1641
- => { fhold; fgoto expr_end; };
1642
-
1643
- c_eof => do_eof;
1644
- *|;
1645
-
1646
- # The rationale for this state is that several keywords accept a value
1647
- # (i.e. should transition to `expr_beg`), do not accept it like a command
1648
- # (i.e. not an `expr_arg`), and must behave like a statement, that is,
1649
- # accept a modifier if/while/etc.
1650
- #
1651
- expr_mid := |*
1652
- keyword_modifier
1653
- => { emit_table(KEYWORDS)
1654
- fnext expr_beg; fbreak; };
1655
-
1656
- bareword
1657
- => { p = @ts - 1; fgoto expr_beg; };
1658
-
1659
- w_space_comment;
1660
-
1661
- w_newline
1662
- => { fhold; fgoto expr_end; };
1663
-
1664
- c_any
1665
- => { fhold; fgoto expr_beg; };
1666
-
1667
- c_eof => do_eof;
1668
- *|;
1669
-
1670
- # Beginning of an expression.
1671
- #
1672
- # Don't fallthrough to this state from `c_any`; make sure to handle
1673
- # `c_space* c_nl` and let `expr_end` handle the newline.
1674
- # Otherwise code like `f\ndef x` gets glued together and the parser
1675
- # explodes.
1676
- #
1677
- expr_beg := |*
1678
- # +5, -5, - 5
1679
- [+\-] w_any* [0-9]
1680
- => {
1681
- emit(:tUNARY_NUM, tok(@ts, @ts + 1), @ts, @ts + 1)
1682
- fhold; fnext expr_end; fbreak;
1683
- };
1684
-
1685
- # splat *a
1686
- '*'
1687
- => { emit(:tSTAR, '*'.freeze)
1688
- fbreak; };
1689
-
1690
- #
1691
- # STRING AND REGEXP LITERALS
1692
- #
1693
-
1694
- # /regexp/oui
1695
- # /=/ (disambiguation with /=)
1696
- '/' c_any
1697
- => {
1698
- type = delimiter = tok[0].chr
1699
- fhold; fgoto *push_literal(type, delimiter, @ts);
1700
- };
1701
-
1702
- # %<string>
1703
- '%' ( any - [A-Za-z] )
1704
- => {
1705
- type, delimiter = @source_buffer.slice(@ts).chr, tok[-1].chr
1706
- fgoto *push_literal(type, delimiter, @ts);
1707
- };
1708
-
1709
- # %w(we are the people)
1710
- '%' [A-Za-z]+ c_any
1711
- => {
1712
- type, delimiter = tok[0..-2], tok[-1].chr
1713
- fgoto *push_literal(type, delimiter, @ts);
1714
- };
1715
-
1716
- '%' c_eof
1717
- => {
1718
- diagnostic :fatal, :string_eof, nil, range(@ts, @ts + 1)
1719
- };
1720
-
1721
- # Heredoc start.
1722
- # <<END | <<'END' | <<"END" | <<`END` |
1723
- # <<-END | <<-'END' | <<-"END" | <<-`END` |
1724
- # <<~END | <<~'END' | <<~"END" | <<~`END`
1725
- '<<' [~\-]?
1726
- ( '"' ( any - '"' )* '"'
1727
- | "'" ( any - "'" )* "'"
1728
- | "`" ( any - "`" )* "`"
1729
- | bareword ) % { heredoc_e = p }
1730
- c_line* c_nl % { new_herebody_s = p }
1731
- => {
1732
- tok(@ts, heredoc_e) =~ /^<<(-?)(~?)(["'`]?)(.*)\3$/m
1733
-
1734
- indent = !$1.empty? || !$2.empty?
1735
- dedent_body = !$2.empty?
1736
- type = $3.empty? ? '<<"'.freeze : ('<<'.freeze + $3)
1737
- delimiter = $4
1738
-
1739
- if @version >= 24
1740
- if delimiter.count("\n") > 0
1741
- if delimiter.end_with?("\n")
1742
- diagnostic :warning, :heredoc_id_ends_with_nl, nil, range(@ts, @ts + 1)
1743
- delimiter = delimiter.rstrip
1744
- else
1745
- diagnostic :fatal, :heredoc_id_has_newline, nil, range(@ts, @ts + 1)
1746
- end
1747
- end
1748
- end
1749
-
1750
- if dedent_body && version?(18, 19, 20, 21, 22)
1751
- emit(:tLSHFT, '<<'.freeze, @ts, @ts + 2)
1752
- p = @ts + 1
1753
- fnext expr_beg; fbreak;
1754
- else
1755
- fnext *push_literal(type, delimiter, @ts, heredoc_e, indent, dedent_body);
1756
-
1757
- @herebody_s ||= new_herebody_s
1758
- p = @herebody_s - 1
1759
- end
1760
- };
1761
-
1762
- #
1763
- # SYMBOL LITERALS
1764
- #
1765
-
1766
- # :&&, :||
1767
- ':' ('&&' | '||') => {
1768
- fhold; fhold;
1769
- emit(:tSYMBEG, tok(@ts, @ts + 1), @ts, @ts + 1)
1770
- fgoto expr_fname;
1771
- };
1772
-
1773
- # :"bar", :'baz'
1774
- ':' ['"] # '
1775
- => {
1776
- type, delimiter = tok, tok[-1].chr
1777
- fgoto *push_literal(type, delimiter, @ts);
1778
- };
1779
-
1780
- # :!@ is :!
1781
- # :~@ is :~
1782
- ':' [!~] '@'
1783
- => {
1784
- emit(:tSYMBOL, tok(@ts + 1, @ts + 2))
1785
- fnext expr_end; fbreak;
1786
- };
1787
-
1788
- ':' bareword ambiguous_symbol_suffix
1789
- => {
1790
- emit(:tSYMBOL, tok(@ts + 1, tm), @ts, tm)
1791
- p = tm - 1
1792
- fnext expr_end; fbreak;
1793
- };
1794
-
1795
- ':' ( bareword | global_var | class_var | instance_var |
1796
- operator_fname | operator_arithmetic | operator_rest )
1797
- => {
1798
- emit(:tSYMBOL, tok(@ts + 1), @ts)
1799
- fnext expr_end; fbreak;
1800
- };
1801
-
1802
- #
1803
- # AMBIGUOUS TERNARY OPERATOR
1804
- #
1805
-
1806
- # Character constant, like ?a, ?\n, ?\u1000, and so on
1807
- # Don't accept \u escape with multiple codepoints, like \u{1 2 3}
1808
- '?' ( e_bs ( escape - ( '\u{' (xdigit+ [ \t]+)+ xdigit+ '}' ))
1809
- | (c_any - c_space_nl - e_bs) % { @escape = nil }
1810
- )
1811
- => {
1812
- value = @escape || tok(@ts + 1)
1813
-
1814
- if version?(18)
1815
- emit(:tINTEGER, value.getbyte(0))
1816
- else
1817
- emit(:tCHARACTER, value)
1818
- end
1819
-
1820
- fnext expr_end; fbreak;
1821
- };
1822
-
1823
- '?' c_space_nl
1824
- => {
1825
- escape = { " " => '\s', "\r" => '\r', "\n" => '\n', "\t" => '\t',
1826
- "\v" => '\v', "\f" => '\f' }[@source_buffer.slice(@ts + 1)]
1827
- diagnostic :warning, :invalid_escape_use, { :escape => escape }, range
1828
-
1829
- p = @ts - 1
1830
- fgoto expr_end;
1831
- };
1832
-
1833
- '?' c_eof
1834
- => {
1835
- diagnostic :fatal, :incomplete_escape, nil, range(@ts, @ts + 1)
1836
- };
1837
-
1838
- # f ?aa : b: Disambiguate with a character literal.
1839
- '?' [A-Za-z_] bareword
1840
- => {
1841
- p = @ts - 1
1842
- fgoto expr_end;
1843
- };
1844
-
1845
- #
1846
- # KEYWORDS AND PUNCTUATION
1847
- #
1848
-
1849
- # a({b=>c})
1850
- e_lbrace
1851
- => {
1852
- if @lambda_stack.last == @paren_nest
1853
- @lambda_stack.pop
1854
- emit(:tLAMBEG, '{'.freeze)
1855
- else
1856
- emit(:tLBRACE, '{'.freeze)
1857
- end
1858
- fbreak;
1859
- };
1860
-
1861
- # a([1, 2])
1862
- e_lbrack
1863
- => { emit(:tLBRACK, '['.freeze)
1864
- fbreak; };
1865
-
1866
- # a()
1867
- e_lparen
1868
- => { emit(:tLPAREN, '('.freeze)
1869
- fbreak; };
1870
-
1871
- # a(+b)
1872
- punctuation_begin
1873
- => { emit_table(PUNCTUATION_BEGIN)
1874
- fbreak; };
1875
-
1876
- # rescue Exception => e: Block rescue.
1877
- # Special because it should transition to expr_mid.
1878
- 'rescue' %{ tm = p } '=>'?
1879
- => { emit(:kRESCUE, 'rescue'.freeze, @ts, tm)
1880
- p = tm - 1
1881
- fnext expr_mid; fbreak; };
1882
-
1883
- # if a: Statement if.
1884
- keyword_modifier
1885
- => { emit_table(KEYWORDS_BEGIN)
1886
- fnext expr_value; fbreak; };
1887
-
1888
- #
1889
- # RUBY 1.9 HASH LABELS
1890
- #
1891
-
1892
- label ( any - ':' )
1893
- => {
1894
- fhold;
1895
-
1896
- if version?(18)
1897
- ident = tok(@ts, @te - 2)
1898
-
1899
- emit((@source_buffer.slice(@ts) =~ /[A-Z]/) ? :tCONSTANT : :tIDENTIFIER,
1900
- ident, @ts, @te - 2)
1901
- fhold; # continue as a symbol
1902
-
1903
- if !@static_env.nil? && @static_env.declared?(ident)
1904
- fnext expr_end;
1905
- else
1906
- fnext *arg_or_cmdarg;
1907
- end
1908
- else
1909
- emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
1910
- fnext expr_labelarg;
1911
- end
1912
-
1913
- fbreak;
1914
- };
1915
-
1916
- #
1917
- # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION
1918
- #
1919
-
1920
- # foo= bar: Disambiguate with bareword rule below.
1921
- bareword ambiguous_ident_suffix |
1922
- # def foo: Disambiguate with bareword rule below.
1923
- keyword
1924
- => { p = @ts - 1
1925
- fgoto expr_end; };
1926
-
1927
- # a = 42; a [42]: Indexing.
1928
- # def a; end; a [42]: Array argument.
1929
- call_or_var
1930
- => local_ident;
1931
-
1932
- (call_or_var - keyword)
1933
- % { ident_tok = tok; ident_ts = @ts; ident_te = @te; }
1934
- w_space+ '('
1935
- => {
1936
- emit(:tIDENTIFIER, ident_tok, ident_ts, ident_te)
1937
- p = ident_te - 1
1938
-
1939
- if !@static_env.nil? && @static_env.declared?(ident_tok) && @version < 25
1940
- fnext expr_endfn;
1941
- else
1942
- fnext expr_cmdarg;
1943
- end
1944
- fbreak;
1945
- };
1946
-
1947
- #
1948
- # WHITESPACE
1949
- #
1950
-
1951
- w_any;
1952
-
1953
- e_heredoc_nl '=begin' ( c_space | c_nl_zlen )
1954
- => {
1955
- p = @ts - 1
1956
- @cs_before_block_comment = @cs
1957
- fgoto line_begin;
1958
- };
1959
-
1960
- #
1961
- # DEFAULT TRANSITION
1962
- #
1963
-
1964
- # The following rules match most binary and all unary operators.
1965
- # Rules for binary operators provide better error reporting.
1966
- operator_arithmetic '=' |
1967
- operator_rest |
1968
- punctuation_end |
1969
- c_any
1970
- => { p = @ts - 1; fgoto expr_end; };
1971
-
1972
- c_eof => do_eof;
1973
- *|;
1974
-
1975
- # Special newline handling for "def a b:"
1976
- #
1977
- expr_labelarg := |*
1978
- w_space_comment;
1979
-
1980
- w_newline
1981
- => {
1982
- if @in_kwarg
1983
- fhold; fgoto expr_end;
1984
- else
1985
- fgoto line_begin;
1986
- end
1987
- };
1988
-
1989
- c_any
1990
- => { fhold; fgoto expr_beg; };
1991
-
1992
- c_eof => do_eof;
1993
- *|;
1994
-
1995
- # Like expr_beg, but no 1.9 label or 2.2 quoted label possible.
1996
- #
1997
- expr_value := |*
1998
- # a:b: a(:b), a::B, A::B
1999
- label (any - ':')
2000
- => { p = @ts - 1
2001
- fgoto expr_end; };
2002
-
2003
- # "bar", 'baz'
2004
- ['"] # '
2005
- => {
2006
- fgoto *push_literal(tok, tok, @ts);
2007
- };
2008
-
2009
- w_space_comment;
2010
-
2011
- w_newline
2012
- => { fgoto line_begin; };
2013
-
2014
- c_any
2015
- => { fhold; fgoto expr_beg; };
2016
-
2017
- c_eof => do_eof;
2018
- *|;
2019
-
2020
- expr_end := |*
2021
- #
2022
- # STABBY LAMBDA
2023
- #
2024
-
2025
- '->'
2026
- => {
2027
- emit(:tLAMBDA, '->'.freeze, @ts, @ts + 2)
2028
-
2029
- @lambda_stack.push @paren_nest
2030
- fnext expr_endfn; fbreak;
2031
- };
2032
-
2033
- e_lbrace | 'do'
2034
- => {
2035
- if @lambda_stack.last == @paren_nest
2036
- @lambda_stack.pop
2037
-
2038
- if tok == '{'.freeze
2039
- emit(:tLAMBEG, '{'.freeze)
2040
- else # 'do'
2041
- emit(:kDO_LAMBDA, 'do'.freeze)
2042
- end
2043
- else
2044
- if tok == '{'.freeze
2045
- emit(:tLCURLY, '{'.freeze)
2046
- else # 'do'
2047
- emit_do
2048
- end
2049
- end
2050
-
2051
- fnext expr_value; fbreak;
2052
- };
2053
-
2054
- #
2055
- # KEYWORDS
2056
- #
2057
-
2058
- keyword_with_fname
2059
- => { emit_table(KEYWORDS)
2060
- fnext expr_fname; fbreak; };
2061
-
2062
- 'class' w_any* '<<'
2063
- => { emit(:kCLASS, 'class'.freeze, @ts, @ts + 5)
2064
- emit(:tLSHFT, '<<'.freeze, @te - 2, @te)
2065
- fnext expr_value; fbreak; };
2066
-
2067
- # a if b:c: Syntax error.
2068
- keyword_modifier
2069
- => { emit_table(KEYWORDS)
2070
- fnext expr_beg; fbreak; };
2071
-
2072
- # elsif b:c: elsif b(:c)
2073
- keyword_with_value
2074
- => { emit_table(KEYWORDS)
2075
- fnext expr_value; fbreak; };
2076
-
2077
- keyword_with_mid
2078
- => { emit_table(KEYWORDS)
2079
- fnext expr_mid; fbreak; };
2080
-
2081
- keyword_with_arg
2082
- => {
2083
- emit_table(KEYWORDS)
2084
-
2085
- if version?(18) && tok == 'not'.freeze
2086
- fnext expr_beg; fbreak;
2087
- else
2088
- fnext expr_arg; fbreak;
2089
- end
2090
- };
2091
-
2092
- '__ENCODING__'
2093
- => {
2094
- if version?(18)
2095
- emit(:tIDENTIFIER)
2096
-
2097
- unless !@static_env.nil? && @static_env.declared?(tok)
2098
- fnext *arg_or_cmdarg;
2099
- end
2100
- else
2101
- emit(:k__ENCODING__, '__ENCODING__'.freeze)
2102
- end
2103
- fbreak;
2104
- };
2105
-
2106
- keyword_with_end
2107
- => { emit_table(KEYWORDS)
2108
- fbreak; };
2109
-
2110
- #
2111
- # NUMERIC LITERALS
2112
- #
2113
-
2114
- ( '0' [Xx] %{ @num_base = 16; @num_digits_s = p } int_hex
2115
- | '0' [Dd] %{ @num_base = 10; @num_digits_s = p } int_dec
2116
- | '0' [Oo] %{ @num_base = 8; @num_digits_s = p } int_dec
2117
- | '0' [Bb] %{ @num_base = 2; @num_digits_s = p } int_bin
2118
- | [1-9] digit* '_'? %{ @num_base = 10; @num_digits_s = @ts } int_dec
2119
- | '0' digit* '_'? %{ @num_base = 8; @num_digits_s = @ts } int_dec
2120
- ) %{ @num_suffix_s = p } int_suffix
2121
- => {
2122
- digits = tok(@num_digits_s, @num_suffix_s)
2123
-
2124
- if digits.end_with? '_'.freeze
2125
- diagnostic :error, :trailing_in_number, { :character => '_'.freeze },
2126
- range(@te - 1, @te)
2127
- elsif digits.empty? && @num_base == 8 && version?(18)
2128
- # 1.8 did not raise an error on 0o.
2129
- digits = '0'.freeze
2130
- elsif digits.empty?
2131
- diagnostic :error, :empty_numeric
2132
- elsif @num_base == 8 && (invalid_idx = digits.index(/[89]/))
2133
- invalid_s = @num_digits_s + invalid_idx
2134
- diagnostic :error, :invalid_octal, nil,
2135
- range(invalid_s, invalid_s + 1)
2136
- end
2137
-
2138
- if version?(18, 19, 20)
2139
- emit(:tINTEGER, digits.to_i(@num_base), @ts, @num_suffix_s)
2140
- p = @num_suffix_s - 1
2141
- else
2142
- @num_xfrm.call(digits.to_i(@num_base))
2143
- end
2144
- fbreak;
2145
- };
2146
-
2147
- flo_frac flo_pow?
2148
- => {
2149
- diagnostic :error, :no_dot_digit_literal
2150
- };
2151
-
2152
- flo_int [eE]
2153
- => {
2154
- if version?(18, 19, 20)
2155
- diagnostic :error,
2156
- :trailing_in_number, { :character => tok(@te - 1, @te) },
2157
- range(@te - 1, @te)
2158
- else
2159
- emit(:tINTEGER, tok(@ts, @te - 1).to_i, @ts, @te - 1)
2160
- fhold; fbreak;
2161
- end
2162
- };
2163
-
2164
- flo_int flo_frac [eE]
2165
- => {
2166
- if version?(18, 19, 20)
2167
- diagnostic :error,
2168
- :trailing_in_number, { :character => tok(@te - 1, @te) },
2169
- range(@te - 1, @te)
2170
- else
2171
- emit(:tFLOAT, tok(@ts, @te - 1).to_f, @ts, @te - 1)
2172
- fhold; fbreak;
2173
- end
2174
- };
2175
-
2176
- flo_int
2177
- ( flo_frac? flo_pow %{ @num_suffix_s = p } flo_pow_suffix
2178
- | flo_frac %{ @num_suffix_s = p } flo_suffix
2179
- )
2180
- => {
2181
- digits = tok(@ts, @num_suffix_s)
2182
-
2183
- if version?(18, 19, 20)
2184
- emit(:tFLOAT, Float(digits), @ts, @num_suffix_s)
2185
- p = @num_suffix_s - 1
2186
- else
2187
- @num_xfrm.call(digits)
2188
- end
2189
- fbreak;
2190
- };
2191
-
2192
- #
2193
- # STRING AND XSTRING LITERALS
2194
- #
2195
-
2196
- # `echo foo`, "bar", 'baz'
2197
- '`' | ['"] # '
2198
- => {
2199
- type, delimiter = tok, tok[-1].chr
2200
- fgoto *push_literal(type, delimiter, @ts, nil, false, false, true);
2201
- };
2202
-
2203
- #
2204
- # CONSTANTS AND VARIABLES
2205
- #
2206
-
2207
- constant
2208
- => { emit(:tCONSTANT)
2209
- fnext *arg_or_cmdarg; fbreak; };
2210
-
2211
- constant ambiguous_const_suffix
2212
- => { emit(:tCONSTANT, tok(@ts, tm), @ts, tm)
2213
- p = tm - 1; fbreak; };
2214
-
2215
- global_var | class_var_v | instance_var_v
2216
- => { p = @ts - 1; fcall expr_variable; };
2217
-
2218
- #
2219
- # METHOD CALLS
2220
- #
2221
-
2222
- '.' | '&.' | '::'
2223
- => { emit_table(PUNCTUATION)
2224
- fnext expr_dot; fbreak; };
2225
-
2226
- call_or_var
2227
- => local_ident;
2228
-
2229
- bareword ambiguous_fid_suffix
2230
- => {
2231
- if tm == @te
2232
- # Suffix was consumed, e.g. foo!
2233
- emit(:tFID)
2234
- else
2235
- # Suffix was not consumed, e.g. foo!=
2236
- emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
2237
- p = tm - 1
2238
- end
2239
- fnext expr_arg; fbreak;
2240
- };
2241
-
2242
- #
2243
- # OPERATORS
2244
- #
2245
-
2246
- # When '|', '~', '!', '=>' are used as operators
2247
- # they do not accept any symbols (or quoted labels) after.
2248
- # Other binary operators accept it.
2249
- ( operator_arithmetic | operator_rest ) - ( '|' | '~' | '!' )
2250
- => {
2251
- emit_table(PUNCTUATION);
2252
- fnext expr_value; fbreak;
2253
- };
2254
-
2255
- ( e_lparen | '|' | '~' | '!' )
2256
- => { emit_table(PUNCTUATION)
2257
- fnext expr_beg; fbreak; };
2258
-
2259
- e_rbrace | e_rparen | ']'
2260
- => {
2261
- emit_table(PUNCTUATION)
2262
-
2263
- if @version < 24
2264
- @cond.lexpop
2265
- @cmdarg.lexpop
2266
- else
2267
- @cond.pop
2268
- @cmdarg.pop
2269
- end
2270
-
2271
- if tok == '}'.freeze || tok == ']'.freeze
2272
- if @version >= 25
2273
- fnext expr_end;
2274
- else
2275
- fnext expr_endarg;
2276
- end
2277
- else # )
2278
- # fnext expr_endfn; ?
2279
- end
2280
-
2281
- fbreak;
2282
- };
2283
-
2284
- operator_arithmetic '='
2285
- => { emit(:tOP_ASGN, tok(@ts, @te - 1))
2286
- fnext expr_beg; fbreak; };
2287
-
2288
- '?'
2289
- => { emit(:tEH, '?'.freeze)
2290
- fnext expr_value; fbreak; };
2291
-
2292
- e_lbrack
2293
- => { emit(:tLBRACK2, '['.freeze)
2294
- fnext expr_beg; fbreak; };
2295
-
2296
- punctuation_end
2297
- => { emit_table(PUNCTUATION)
2298
- fnext expr_beg; fbreak; };
2299
-
2300
- #
2301
- # WHITESPACE
2302
- #
2303
-
2304
- w_space_comment;
2305
-
2306
- w_newline
2307
- => { fgoto leading_dot; };
2308
-
2309
- ';'
2310
- => { emit(:tSEMI, ';'.freeze)
2311
- fnext expr_value; fbreak; };
2312
-
2313
- '\\' c_line {
2314
- diagnostic :error, :bare_backslash, nil, range(@ts, @ts + 1)
2315
- fhold;
2316
- };
2317
-
2318
- c_any
2319
- => {
2320
- diagnostic :fatal, :unexpected, { :character => tok.inspect[1..-2] }
2321
- };
2322
-
2323
- c_eof => do_eof;
2324
- *|;
2325
-
2326
- leading_dot := |*
2327
- # Insane leading dots:
2328
- # a #comment
2329
- # .b: a.b
2330
- c_space* %{ tm = p } ('.' | '&.')
2331
- => { p = tm - 1; fgoto expr_end; };
2332
-
2333
- any
2334
- => { emit(:tNL, nil, @newline_s, @newline_s + 1)
2335
- fhold; fnext line_begin; fbreak; };
2336
- *|;
2337
-
2338
- #
2339
- # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING ===
2340
- #
2341
-
2342
- line_comment := |*
2343
- '=end' c_line* c_nl_zlen
2344
- => {
2345
- emit_comment(@eq_begin_s, @te)
2346
- fgoto *@cs_before_block_comment;
2347
- };
2348
-
2349
- c_line* c_nl;
2350
-
2351
- c_line* zlen
2352
- => {
2353
- diagnostic :fatal, :embedded_document, nil,
2354
- range(@eq_begin_s, @eq_begin_s + '=begin'.length)
2355
- };
2356
- *|;
2357
-
2358
- line_begin := |*
2359
- w_any;
2360
-
2361
- '=begin' ( c_space | c_nl_zlen )
2362
- => { @eq_begin_s = @ts
2363
- fgoto line_comment; };
2364
-
2365
- '__END__' ( c_eol - zlen )
2366
- => { p = pe - 3 };
2367
-
2368
- c_any
2369
- => { fhold; fgoto expr_value; };
2370
-
2371
- c_eof => do_eof;
2372
- *|;
2373
-
2374
- }%%
2375
- # %
2376
- end