natalie_parser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +22 -0
  3. data/Dockerfile +26 -0
  4. data/Gemfile +10 -0
  5. data/LICENSE +21 -0
  6. data/README.md +55 -0
  7. data/Rakefile +242 -0
  8. data/ext/natalie_parser/extconf.rb +9 -0
  9. data/ext/natalie_parser/mri_creator.hpp +139 -0
  10. data/ext/natalie_parser/natalie_parser.cpp +144 -0
  11. data/include/natalie_parser/creator/debug_creator.hpp +113 -0
  12. data/include/natalie_parser/creator.hpp +108 -0
  13. data/include/natalie_parser/lexer/interpolated_string_lexer.hpp +64 -0
  14. data/include/natalie_parser/lexer/regexp_lexer.hpp +37 -0
  15. data/include/natalie_parser/lexer/word_array_lexer.hpp +57 -0
  16. data/include/natalie_parser/lexer.hpp +135 -0
  17. data/include/natalie_parser/node/alias_node.hpp +35 -0
  18. data/include/natalie_parser/node/arg_node.hpp +74 -0
  19. data/include/natalie_parser/node/array_node.hpp +34 -0
  20. data/include/natalie_parser/node/array_pattern_node.hpp +28 -0
  21. data/include/natalie_parser/node/assignment_node.hpp +34 -0
  22. data/include/natalie_parser/node/back_ref_node.hpp +28 -0
  23. data/include/natalie_parser/node/begin_block_node.hpp +25 -0
  24. data/include/natalie_parser/node/begin_node.hpp +52 -0
  25. data/include/natalie_parser/node/begin_rescue_node.hpp +47 -0
  26. data/include/natalie_parser/node/bignum_node.hpp +37 -0
  27. data/include/natalie_parser/node/block_node.hpp +55 -0
  28. data/include/natalie_parser/node/block_pass_node.hpp +33 -0
  29. data/include/natalie_parser/node/break_node.hpp +32 -0
  30. data/include/natalie_parser/node/call_node.hpp +85 -0
  31. data/include/natalie_parser/node/case_in_node.hpp +40 -0
  32. data/include/natalie_parser/node/case_node.hpp +52 -0
  33. data/include/natalie_parser/node/case_when_node.hpp +43 -0
  34. data/include/natalie_parser/node/class_node.hpp +39 -0
  35. data/include/natalie_parser/node/colon2_node.hpp +44 -0
  36. data/include/natalie_parser/node/colon3_node.hpp +34 -0
  37. data/include/natalie_parser/node/constant_node.hpp +26 -0
  38. data/include/natalie_parser/node/def_node.hpp +55 -0
  39. data/include/natalie_parser/node/defined_node.hpp +33 -0
  40. data/include/natalie_parser/node/encoding_node.hpp +26 -0
  41. data/include/natalie_parser/node/end_block_node.hpp +25 -0
  42. data/include/natalie_parser/node/evaluate_to_string_node.hpp +37 -0
  43. data/include/natalie_parser/node/false_node.hpp +23 -0
  44. data/include/natalie_parser/node/fixnum_node.hpp +36 -0
  45. data/include/natalie_parser/node/float_node.hpp +36 -0
  46. data/include/natalie_parser/node/hash_node.hpp +34 -0
  47. data/include/natalie_parser/node/hash_pattern_node.hpp +27 -0
  48. data/include/natalie_parser/node/identifier_node.hpp +123 -0
  49. data/include/natalie_parser/node/if_node.hpp +43 -0
  50. data/include/natalie_parser/node/infix_op_node.hpp +46 -0
  51. data/include/natalie_parser/node/interpolated_node.hpp +33 -0
  52. data/include/natalie_parser/node/interpolated_regexp_node.hpp +28 -0
  53. data/include/natalie_parser/node/interpolated_shell_node.hpp +22 -0
  54. data/include/natalie_parser/node/interpolated_string_node.hpp +31 -0
  55. data/include/natalie_parser/node/interpolated_symbol_key_node.hpp +18 -0
  56. data/include/natalie_parser/node/interpolated_symbol_node.hpp +28 -0
  57. data/include/natalie_parser/node/iter_node.hpp +45 -0
  58. data/include/natalie_parser/node/keyword_arg_node.hpp +25 -0
  59. data/include/natalie_parser/node/keyword_splat_node.hpp +38 -0
  60. data/include/natalie_parser/node/logical_and_node.hpp +40 -0
  61. data/include/natalie_parser/node/logical_or_node.hpp +40 -0
  62. data/include/natalie_parser/node/match_node.hpp +38 -0
  63. data/include/natalie_parser/node/module_node.hpp +32 -0
  64. data/include/natalie_parser/node/multiple_assignment_arg_node.hpp +32 -0
  65. data/include/natalie_parser/node/multiple_assignment_node.hpp +37 -0
  66. data/include/natalie_parser/node/next_node.hpp +37 -0
  67. data/include/natalie_parser/node/nil_node.hpp +23 -0
  68. data/include/natalie_parser/node/nil_sexp_node.hpp +23 -0
  69. data/include/natalie_parser/node/node.hpp +155 -0
  70. data/include/natalie_parser/node/node_with_args.hpp +47 -0
  71. data/include/natalie_parser/node/not_match_node.hpp +35 -0
  72. data/include/natalie_parser/node/not_node.hpp +37 -0
  73. data/include/natalie_parser/node/nth_ref_node.hpp +27 -0
  74. data/include/natalie_parser/node/op_assign_accessor_node.hpp +74 -0
  75. data/include/natalie_parser/node/op_assign_and_node.hpp +34 -0
  76. data/include/natalie_parser/node/op_assign_node.hpp +47 -0
  77. data/include/natalie_parser/node/op_assign_or_node.hpp +34 -0
  78. data/include/natalie_parser/node/pin_node.hpp +33 -0
  79. data/include/natalie_parser/node/range_node.hpp +52 -0
  80. data/include/natalie_parser/node/redo_node.hpp +20 -0
  81. data/include/natalie_parser/node/regexp_node.hpp +36 -0
  82. data/include/natalie_parser/node/retry_node.hpp +20 -0
  83. data/include/natalie_parser/node/return_node.hpp +34 -0
  84. data/include/natalie_parser/node/safe_call_node.hpp +31 -0
  85. data/include/natalie_parser/node/sclass_node.hpp +37 -0
  86. data/include/natalie_parser/node/self_node.hpp +23 -0
  87. data/include/natalie_parser/node/shadow_arg_node.hpp +40 -0
  88. data/include/natalie_parser/node/shell_node.hpp +32 -0
  89. data/include/natalie_parser/node/splat_node.hpp +39 -0
  90. data/include/natalie_parser/node/splat_value_node.hpp +32 -0
  91. data/include/natalie_parser/node/stabby_proc_node.hpp +29 -0
  92. data/include/natalie_parser/node/string_node.hpp +42 -0
  93. data/include/natalie_parser/node/super_node.hpp +44 -0
  94. data/include/natalie_parser/node/symbol_key_node.hpp +19 -0
  95. data/include/natalie_parser/node/symbol_node.hpp +30 -0
  96. data/include/natalie_parser/node/to_array_node.hpp +33 -0
  97. data/include/natalie_parser/node/true_node.hpp +23 -0
  98. data/include/natalie_parser/node/unary_op_node.hpp +41 -0
  99. data/include/natalie_parser/node/undef_node.hpp +31 -0
  100. data/include/natalie_parser/node/until_node.hpp +21 -0
  101. data/include/natalie_parser/node/while_node.hpp +52 -0
  102. data/include/natalie_parser/node/yield_node.hpp +29 -0
  103. data/include/natalie_parser/node.hpp +89 -0
  104. data/include/natalie_parser/parser.hpp +218 -0
  105. data/include/natalie_parser/token.hpp +842 -0
  106. data/include/tm/defer.hpp +34 -0
  107. data/include/tm/hashmap.hpp +826 -0
  108. data/include/tm/macros.hpp +16 -0
  109. data/include/tm/optional.hpp +223 -0
  110. data/include/tm/owned_ptr.hpp +186 -0
  111. data/include/tm/recursion_guard.hpp +156 -0
  112. data/include/tm/shared_ptr.hpp +259 -0
  113. data/include/tm/string.hpp +1447 -0
  114. data/include/tm/tests.hpp +78 -0
  115. data/include/tm/vector.hpp +796 -0
  116. data/lib/natalie_parser/sexp.rb +36 -0
  117. data/lib/natalie_parser/version.rb +5 -0
  118. data/lib/natalie_parser.rb +3 -0
  119. data/natalie_parser.gemspec +23 -0
  120. data/src/lexer/interpolated_string_lexer.cpp +88 -0
  121. data/src/lexer/regexp_lexer.cpp +95 -0
  122. data/src/lexer/word_array_lexer.cpp +134 -0
  123. data/src/lexer.cpp +1703 -0
  124. data/src/node/alias_node.cpp +11 -0
  125. data/src/node/assignment_node.cpp +33 -0
  126. data/src/node/begin_node.cpp +29 -0
  127. data/src/node/begin_rescue_node.cpp +33 -0
  128. data/src/node/class_node.cpp +22 -0
  129. data/src/node/interpolated_regexp_node.cpp +19 -0
  130. data/src/node/interpolated_shell_node.cpp +25 -0
  131. data/src/node/interpolated_string_node.cpp +111 -0
  132. data/src/node/interpolated_symbol_node.cpp +25 -0
  133. data/src/node/match_node.cpp +14 -0
  134. data/src/node/module_node.cpp +21 -0
  135. data/src/node/multiple_assignment_node.cpp +37 -0
  136. data/src/node/node.cpp +10 -0
  137. data/src/node/node_with_args.cpp +35 -0
  138. data/src/node/op_assign_node.cpp +36 -0
  139. data/src/node/string_node.cpp +33 -0
  140. data/src/parser.cpp +2972 -0
  141. data/src/token.cpp +27 -0
  142. metadata +186 -0
data/src/lexer.cpp ADDED
@@ -0,0 +1,1703 @@
1
#include <ctype.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>

#include <limits>

#include "natalie_parser/lexer.hpp"
#include "natalie_parser/lexer/interpolated_string_lexer.hpp"
#include "natalie_parser/lexer/regexp_lexer.hpp"
#include "natalie_parser/lexer/word_array_lexer.hpp"
#include "natalie_parser/token.hpp"
10
+
11
+ namespace NatalieParser {
12
+
13
+ SharedPtr<Vector<Token>> Lexer::tokens() {
14
+ SharedPtr<Vector<Token>> tokens = new Vector<Token> {};
15
+ bool skip_next_newline = false;
16
+ Token last_doc_token;
17
+ for (;;) {
18
+ auto token = next_token();
19
+ if (token.is_comment())
20
+ continue;
21
+
22
+ if (token.is_doc()) {
23
+ if (last_doc_token)
24
+ last_doc_token.literal_string()->append(*token.literal_string());
25
+ else
26
+ last_doc_token = token;
27
+ continue;
28
+ }
29
+
30
+ // get rid of newlines after certain tokens
31
+ if (skip_next_newline) {
32
+ if (token.is_newline())
33
+ continue;
34
+ else
35
+ skip_next_newline = false;
36
+ }
37
+
38
+ // get rid of newlines before certain tokens
39
+ while (token.can_follow_collapsible_newline() && !tokens->is_empty() && tokens->last().is_newline())
40
+ tokens->pop();
41
+
42
+ if (last_doc_token) {
43
+ if (token.can_have_doc()) {
44
+ token.set_doc(last_doc_token.literal_string());
45
+ last_doc_token = {};
46
+ } else if (!token.is_end_of_line()) {
47
+ last_doc_token = {};
48
+ }
49
+ }
50
+
51
+ tokens->push(token);
52
+
53
+ m_last_token = token;
54
+
55
+ if (token.is_eof())
56
+ return tokens;
57
+ if (!token.is_valid())
58
+ return tokens;
59
+ if (token.can_precede_collapsible_newline())
60
+ skip_next_newline = true;
61
+ };
62
+ TM_UNREACHABLE();
63
+ }
64
+
65
+ Token Lexer::next_token() {
66
+ if (m_nested_lexer) {
67
+ auto token = m_nested_lexer->next_token();
68
+ if (token.is_eof()) {
69
+ if (m_nested_lexer->alters_parent_cursor_position()) {
70
+ m_index = m_nested_lexer->m_index;
71
+ m_cursor_line = m_nested_lexer->m_cursor_line;
72
+ m_cursor_column = m_nested_lexer->m_cursor_column;
73
+ }
74
+ delete m_nested_lexer;
75
+ m_nested_lexer = nullptr;
76
+ } else {
77
+ return token;
78
+ }
79
+ }
80
+ m_whitespace_precedes = skip_whitespace();
81
+ m_token_line = m_cursor_line;
82
+ m_token_column = m_cursor_column;
83
+ return build_next_token();
84
+ }
85
+
86
// Character-classification helpers for identifier lexing.
//
// FIX: isalnum() has undefined behavior when handed a negative value
// other than EOF — which is exactly what a plain (signed) char holding
// a byte >= 0x80 produces. Cast through unsigned char before calling it.

// True if `c` can appear in an identifier: ASCII alphanumerics, '_',
// or any non-ASCII byte (>= 128), so multibyte UTF-8 names pass.
bool is_identifier_char(char c) {
    if (!c) return false;
    const unsigned char uc = static_cast<unsigned char>(c);
    return uc >= 128 || isalnum(uc) || c == '_';
}

// True if `c` is a method-name suffix character ('?' or '!').
bool is_message_suffix(char c) {
    if (!c) return false;
    return c == '?' || c == '!';
}

// True if `c` can be part of an identifier or is a method-name suffix.
bool is_identifier_char_or_message_suffix(char c) {
    return is_identifier_char(c) || is_message_suffix(c);
}
99
+
100
+ bool Lexer::match(size_t bytes, const char *compare) {
101
+ if (m_index + bytes > m_size)
102
+ return false;
103
+ if (strncmp(compare, m_input->c_str() + m_index, bytes) == 0) {
104
+ if (m_index + bytes < m_size && is_identifier_char_or_message_suffix(m_input->at(m_index + bytes)))
105
+ return false;
106
+ advance(bytes);
107
+ return true;
108
+ }
109
+ return false;
110
+ }
111
+
112
+ void Lexer::advance() {
113
+ auto c = current_char();
114
+ m_index++;
115
+ if (c == '\n') {
116
+ m_cursor_line++;
117
+ m_cursor_column = 0;
118
+ } else {
119
+ m_cursor_column++;
120
+ }
121
+ }
122
+
123
+ void Lexer::advance(size_t bytes) {
124
+ for (size_t i = 0; i < bytes; i++) {
125
+ advance();
126
+ }
127
+ }
128
+
129
// Move the cursor back `bytes` characters.
// NOTE: this does not work across lines
// (m_cursor_column is decremented blindly, so rewinding past a '\n'
// would leave line/column state wrong).
void Lexer::rewind(size_t bytes) {
    // NOTE(review): result is discarded — presumably called for a side
    // effect inside current_char(); confirm before removing.
    current_char();
    m_cursor_column -= bytes;
    m_index -= bytes;
}
135
+
136
+ bool Lexer::skip_whitespace() {
137
+ bool whitespace_found = false;
138
+ char c = current_char();
139
+ while (c == ' ' || c == '\t' || (c == '\\' && peek() == '\n')) {
140
+ whitespace_found = true;
141
+ advance();
142
+ if (c == '\\') advance();
143
+ c = current_char();
144
+ }
145
+ return whitespace_found;
146
+ }
147
+
148
+ Token Lexer::build_next_token() {
149
+ if (m_index >= m_size)
150
+ return Token { Token::Type::Eof, m_file, m_cursor_line, m_cursor_column };
151
+ if (m_start_char && current_char() == m_start_char) {
152
+ m_pair_depth++;
153
+ } else if (m_stop_char && current_char() == m_stop_char) {
154
+ if (m_pair_depth == 0)
155
+ return Token { Token::Type::Eof, m_file, m_cursor_line, m_cursor_column };
156
+ m_pair_depth--;
157
+ } else if (m_index == 0 && current_char() == '\xEF') {
158
+ // UTF-8 BOM
159
+ advance(); // \xEF
160
+ if (current_char() == '\xBB') advance();
161
+ if (current_char() == '\xBF') advance();
162
+ }
163
+ Token token;
164
+ switch (current_char()) {
165
+ case '=': {
166
+ advance();
167
+ switch (current_char()) {
168
+ case '=': {
169
+ advance();
170
+ switch (current_char()) {
171
+ case '=': {
172
+ advance();
173
+ return Token { Token::Type::EqualEqualEqual, m_file, m_token_line, m_token_column };
174
+ }
175
+ default:
176
+ return Token { Token::Type::EqualEqual, m_file, m_token_line, m_token_column };
177
+ }
178
+ }
179
+ case '>':
180
+ advance();
181
+ return Token { Token::Type::HashRocket, m_file, m_token_line, m_token_column };
182
+ case '~':
183
+ advance();
184
+ return Token { Token::Type::Match, m_file, m_token_line, m_token_column };
185
+ default:
186
+ if (m_cursor_column == 1 && match(5, "begin")) {
187
+ SharedPtr<String> doc = new String("=begin");
188
+ char c = current_char();
189
+ do {
190
+ doc->append_char(c);
191
+ c = next();
192
+ } while (c && !(m_cursor_column == 0 && match(4, "=end")));
193
+ doc->append("=end\n");
194
+ return Token { Token::Type::Doc, doc, m_file, m_token_line, m_token_column };
195
+ }
196
+ auto token = Token { Token::Type::Equal, m_file, m_token_line, m_token_column };
197
+ token.set_whitespace_precedes(m_whitespace_precedes);
198
+ return token;
199
+ }
200
+ }
201
+ case '+':
202
+ advance();
203
+ switch (current_char()) {
204
+ case '=':
205
+ advance();
206
+ return Token { Token::Type::PlusEqual, m_file, m_token_line, m_token_column };
207
+ case '@':
208
+ if (m_last_token.is_def_keyword() || m_last_token.is_dot()) {
209
+ advance();
210
+ SharedPtr<String> lit = new String("+@");
211
+ return Token { Token::Type::BareName, lit, m_file, m_token_line, m_token_column };
212
+ } else {
213
+ return Token { Token::Type::Plus, m_file, m_token_line, m_token_column };
214
+ }
215
+ default:
216
+ return Token { Token::Type::Plus, m_file, m_token_line, m_token_column };
217
+ }
218
+ case '-':
219
+ advance();
220
+ switch (current_char()) {
221
+ case '>':
222
+ advance();
223
+ return Token { Token::Type::Arrow, m_file, m_token_line, m_token_column };
224
+ case '=':
225
+ advance();
226
+ return Token { Token::Type::MinusEqual, m_file, m_token_line, m_token_column };
227
+ case '@':
228
+ if (m_last_token.is_def_keyword() || m_last_token.is_dot()) {
229
+ advance();
230
+ SharedPtr<String> lit = new String("-@");
231
+ return Token { Token::Type::BareName, lit, m_file, m_token_line, m_token_column };
232
+ } else {
233
+ return Token { Token::Type::Minus, m_file, m_token_line, m_token_column };
234
+ }
235
+ default:
236
+ return Token { Token::Type::Minus, m_file, m_token_line, m_token_column };
237
+ }
238
+ case '*':
239
+ advance();
240
+ switch (current_char()) {
241
+ case '*':
242
+ advance();
243
+ switch (current_char()) {
244
+ case '=':
245
+ advance();
246
+ return Token { Token::Type::StarStarEqual, m_file, m_token_line, m_token_column };
247
+ default:
248
+ return Token { Token::Type::StarStar, m_file, m_token_line, m_token_column };
249
+ }
250
+ case '=':
251
+ advance();
252
+ return Token { Token::Type::StarEqual, m_file, m_token_line, m_token_column };
253
+ default:
254
+ return Token { Token::Type::Star, m_file, m_token_line, m_token_column };
255
+ }
256
+ case '/': {
257
+ advance();
258
+ if (!m_last_token)
259
+ return consume_regexp('/', '/');
260
+ switch (m_last_token.type()) {
261
+ case Token::Type::Comma:
262
+ case Token::Type::Doc:
263
+ case Token::Type::LBracket:
264
+ case Token::Type::LCurlyBrace:
265
+ case Token::Type::LParen:
266
+ case Token::Type::Match:
267
+ case Token::Type::Newline:
268
+ return consume_regexp('/', '/');
269
+ case Token::Type::DefKeyword:
270
+ return Token { Token::Type::Slash, m_file, m_token_line, m_token_column };
271
+ default: {
272
+ switch (current_char()) {
273
+ case ' ':
274
+ return Token { Token::Type::Slash, m_file, m_token_line, m_token_column };
275
+ case '=':
276
+ advance();
277
+ return Token { Token::Type::SlashEqual, m_file, m_token_line, m_token_column };
278
+ default:
279
+ if (m_whitespace_precedes) {
280
+ return consume_regexp('/', '/');
281
+ } else {
282
+ return Token { Token::Type::Slash, m_file, m_token_line, m_token_column };
283
+ }
284
+ }
285
+ }
286
+ }
287
+ }
288
+ case '%':
289
+ advance();
290
+ switch (current_char()) {
291
+ case '=':
292
+ advance();
293
+ return Token { Token::Type::PercentEqual, m_file, m_token_line, m_token_column };
294
+ case 'q':
295
+ switch (peek()) {
296
+ case '[':
297
+ advance(2);
298
+ return consume_single_quoted_string('[', ']');
299
+ case '{':
300
+ advance(2);
301
+ return consume_single_quoted_string('{', '}');
302
+ case '<':
303
+ advance(2);
304
+ return consume_single_quoted_string('<', '>');
305
+ case '(':
306
+ advance(2);
307
+ return consume_single_quoted_string('(', ')');
308
+ default: {
309
+ char c = peek();
310
+ if (char_can_be_string_or_regexp_delimiter(c)) {
311
+ advance(2);
312
+ return consume_single_quoted_string(c, c);
313
+ } else {
314
+ return Token { Token::Type::Percent, m_file, m_token_line, m_token_column };
315
+ }
316
+ }
317
+ }
318
+ case 'Q':
319
+ switch (peek()) {
320
+ case '[':
321
+ advance(2);
322
+ return consume_double_quoted_string('[', ']');
323
+ case '{':
324
+ advance(2);
325
+ return consume_double_quoted_string('{', '}');
326
+ case '<':
327
+ advance(2);
328
+ return consume_double_quoted_string('<', '>');
329
+ case '(':
330
+ advance(2);
331
+ return consume_double_quoted_string('(', ')');
332
+ default: {
333
+ char c = peek();
334
+ if (char_can_be_string_or_regexp_delimiter(c)) {
335
+ advance(2);
336
+ return consume_double_quoted_string(c, c);
337
+ } else {
338
+ return Token { Token::Type::Percent, m_file, m_token_line, m_token_column };
339
+ }
340
+ }
341
+ }
342
+ case 'r':
343
+ switch (peek()) {
344
+ case '[':
345
+ advance(2);
346
+ return consume_regexp('[', ']');
347
+ case '{':
348
+ advance(2);
349
+ return consume_regexp('{', '}');
350
+ case '(':
351
+ advance(2);
352
+ return consume_regexp('(', ')');
353
+ case '<':
354
+ advance(2);
355
+ return consume_regexp('<', '>');
356
+ default: {
357
+ char c = peek();
358
+ if (char_can_be_string_or_regexp_delimiter(c)) {
359
+ advance(2);
360
+ return consume_regexp(c, c);
361
+ } else {
362
+ return Token { Token::Type::Percent, m_file, m_token_line, m_token_column };
363
+ }
364
+ }
365
+ }
366
+ case 'x':
367
+ switch (peek()) {
368
+ case '/': {
369
+ advance(2);
370
+ return consume_double_quoted_string('/', '/', Token::Type::InterpolatedShellBegin, Token::Type::InterpolatedShellEnd);
371
+ }
372
+ case '[': {
373
+ advance(2);
374
+ return consume_double_quoted_string('[', ']', Token::Type::InterpolatedShellBegin, Token::Type::InterpolatedShellEnd);
375
+ }
376
+ case '{': {
377
+ advance(2);
378
+ return consume_double_quoted_string('{', '}', Token::Type::InterpolatedShellBegin, Token::Type::InterpolatedShellEnd);
379
+ }
380
+ case '(': {
381
+ advance(2);
382
+ return consume_double_quoted_string('(', ')', Token::Type::InterpolatedShellBegin, Token::Type::InterpolatedShellEnd);
383
+ }
384
+ default:
385
+ return Token { Token::Type::Percent, m_file, m_token_line, m_token_column };
386
+ }
387
+ case 'w':
388
+ switch (peek()) {
389
+ case '/':
390
+ case '|': {
391
+ char c = next();
392
+ advance();
393
+ return consume_quoted_array_without_interpolation(c, c, Token::Type::PercentLowerW);
394
+ }
395
+ case '[':
396
+ advance(2);
397
+ return consume_quoted_array_without_interpolation('[', ']', Token::Type::PercentLowerW);
398
+ case '{':
399
+ advance(2);
400
+ return consume_quoted_array_without_interpolation('{', '}', Token::Type::PercentLowerW);
401
+ case '<':
402
+ advance(2);
403
+ return consume_quoted_array_without_interpolation('<', '>', Token::Type::PercentLowerW);
404
+ case '(':
405
+ advance(2);
406
+ return consume_quoted_array_without_interpolation('(', ')', Token::Type::PercentLowerW);
407
+ default:
408
+ return Token { Token::Type::Percent, m_file, m_token_line, m_token_column };
409
+ }
410
+ case 'W':
411
+ switch (peek()) {
412
+ case '/':
413
+ case '|': {
414
+ char c = next();
415
+ advance();
416
+ return consume_quoted_array_with_interpolation(0, c, Token::Type::PercentUpperW);
417
+ }
418
+ case '[':
419
+ advance(2);
420
+ return consume_quoted_array_with_interpolation('[', ']', Token::Type::PercentUpperW);
421
+ case '{':
422
+ advance(2);
423
+ return consume_quoted_array_with_interpolation('{', '}', Token::Type::PercentUpperW);
424
+ case '<':
425
+ advance(2);
426
+ return consume_quoted_array_with_interpolation('<', '>', Token::Type::PercentUpperW);
427
+ case '(':
428
+ advance(2);
429
+ return consume_quoted_array_with_interpolation('(', ')', Token::Type::PercentUpperW);
430
+ default:
431
+ return Token { Token::Type::Percent, m_file, m_token_line, m_token_column };
432
+ }
433
+ case 'i':
434
+ switch (peek()) {
435
+ case '|':
436
+ case '/': {
437
+ char c = next();
438
+ advance();
439
+ return consume_quoted_array_without_interpolation(c, c, Token::Type::PercentLowerI);
440
+ }
441
+ case '[':
442
+ advance(2);
443
+ return consume_quoted_array_without_interpolation('[', ']', Token::Type::PercentLowerI);
444
+ case '{':
445
+ advance(2);
446
+ return consume_quoted_array_without_interpolation('{', '}', Token::Type::PercentLowerI);
447
+ case '<':
448
+ advance(2);
449
+ return consume_quoted_array_without_interpolation('<', '>', Token::Type::PercentLowerI);
450
+ case '(':
451
+ advance(2);
452
+ return consume_quoted_array_without_interpolation('(', ')', Token::Type::PercentLowerI);
453
+ default:
454
+ return Token { Token::Type::Percent, m_file, m_token_line, m_token_column };
455
+ }
456
+ case 'I':
457
+ switch (peek()) {
458
+ case '|':
459
+ case '/': {
460
+ char c = next();
461
+ advance();
462
+ return consume_quoted_array_with_interpolation(0, c, Token::Type::PercentUpperI);
463
+ }
464
+ case '[':
465
+ advance(2);
466
+ return consume_quoted_array_with_interpolation('[', ']', Token::Type::PercentUpperI);
467
+ case '{':
468
+ advance(2);
469
+ return consume_quoted_array_with_interpolation('{', '}', Token::Type::PercentUpperI);
470
+ case '<':
471
+ advance(2);
472
+ return consume_quoted_array_with_interpolation('<', '>', Token::Type::PercentUpperI);
473
+ case '(':
474
+ advance(2);
475
+ return consume_quoted_array_with_interpolation('(', ')', Token::Type::PercentUpperI);
476
+ default:
477
+ return Token { Token::Type::Percent, m_file, m_token_line, m_token_column };
478
+ }
479
+ case '[':
480
+ advance();
481
+ return consume_double_quoted_string('[', ']');
482
+ case '{':
483
+ advance();
484
+ return consume_double_quoted_string('{', '}');
485
+ case '<':
486
+ advance();
487
+ return consume_double_quoted_string('<', '>');
488
+ case '(':
489
+ if (m_last_token.type() == Token::Type::DefKeyword || m_last_token.type() == Token::Type::Dot) {
490
+ // It's a trap! This looks like a %(string) but it's a method def/call!
491
+ break;
492
+ }
493
+ advance();
494
+ return consume_double_quoted_string('(', ')');
495
+ default: {
496
+ auto c = current_char();
497
+ if (char_can_be_string_or_regexp_delimiter(c)) {
498
+ advance();
499
+ return consume_double_quoted_string(c, c);
500
+ }
501
+ break;
502
+ }
503
+ }
504
+ return Token { Token::Type::Percent, m_file, m_token_line, m_token_column };
505
+ case '!':
506
+ advance();
507
+ switch (current_char()) {
508
+ case '=':
509
+ advance();
510
+ return Token { Token::Type::NotEqual, m_file, m_token_line, m_token_column };
511
+ case '~':
512
+ advance();
513
+ return Token { Token::Type::NotMatch, m_file, m_token_line, m_token_column };
514
+ case '@':
515
+ if (m_last_token.is_def_keyword() || m_last_token.is_dot()) {
516
+ advance();
517
+ SharedPtr<String> lit = new String("!@");
518
+ return Token { Token::Type::BareName, lit, m_file, m_token_line, m_token_column };
519
+ } else {
520
+ return Token { Token::Type::Not, m_file, m_token_line, m_token_column };
521
+ }
522
+ default:
523
+ return Token { Token::Type::Not, m_file, m_token_line, m_token_column };
524
+ }
525
+ case '<':
526
+ advance();
527
+ switch (current_char()) {
528
+ case '<': {
529
+ advance();
530
+ switch (current_char()) {
531
+ case '~':
532
+ case '-': {
533
+ auto next = peek();
534
+ if (isalpha(next))
535
+ return consume_heredoc();
536
+ switch (next) {
537
+ case '_':
538
+ case '"':
539
+ case '`':
540
+ case '\'':
541
+ return consume_heredoc();
542
+ default:
543
+ return Token { Token::Type::LeftShift, m_file, m_token_line, m_token_column };
544
+ }
545
+ }
546
+ case '=':
547
+ advance();
548
+ return Token { Token::Type::LeftShiftEqual, m_file, m_token_line, m_token_column };
549
+ default:
550
+ if (!m_whitespace_precedes) {
551
+ if (token_is_first_on_line())
552
+ return consume_heredoc();
553
+ else if (m_last_token.can_precede_heredoc_that_looks_like_left_shift_operator())
554
+ return consume_heredoc();
555
+ else
556
+ return Token { Token::Type::LeftShift, m_file, m_token_line, m_token_column };
557
+ }
558
+ if (isalpha(current_char()))
559
+ return consume_heredoc();
560
+ switch (current_char()) {
561
+ case '_':
562
+ case '"':
563
+ case '`':
564
+ case '\'':
565
+ return consume_heredoc();
566
+ default:
567
+ return Token { Token::Type::LeftShift, m_file, m_token_line, m_token_column };
568
+ }
569
+ }
570
+ }
571
+ case '=':
572
+ advance();
573
+ switch (current_char()) {
574
+ case '>':
575
+ advance();
576
+ return Token { Token::Type::Comparison, m_file, m_token_line, m_token_column };
577
+ default:
578
+ return Token { Token::Type::LessThanOrEqual, m_file, m_token_line, m_token_column };
579
+ }
580
+ default:
581
+ return Token { Token::Type::LessThan, m_file, m_token_line, m_token_column };
582
+ }
583
+ case '>':
584
+ advance();
585
+ switch (current_char()) {
586
+ case '>':
587
+ advance();
588
+ switch (current_char()) {
589
+ case '=':
590
+ advance();
591
+ return Token { Token::Type::RightShiftEqual, m_file, m_token_line, m_token_column };
592
+ default:
593
+ return Token { Token::Type::RightShift, m_file, m_token_line, m_token_column };
594
+ }
595
+ case '=':
596
+ advance();
597
+ return Token { Token::Type::GreaterThanOrEqual, m_file, m_token_line, m_token_column };
598
+ default:
599
+ return Token { Token::Type::GreaterThan, m_file, m_token_line, m_token_column };
600
+ }
601
+ case '&':
602
+ advance();
603
+ switch (current_char()) {
604
+ case '&':
605
+ advance();
606
+ switch (current_char()) {
607
+ case '=':
608
+ advance();
609
+ return Token { Token::Type::AmpersandAmpersandEqual, m_file, m_token_line, m_token_column };
610
+ default:
611
+ return Token { Token::Type::AmpersandAmpersand, m_file, m_token_line, m_token_column };
612
+ }
613
+ case '=':
614
+ advance();
615
+ return Token { Token::Type::AmpersandEqual, m_file, m_token_line, m_token_column };
616
+ case '.':
617
+ advance();
618
+ return Token { Token::Type::SafeNavigation, m_file, m_token_line, m_token_column };
619
+ default:
620
+ return Token { Token::Type::Ampersand, m_file, m_token_line, m_token_column };
621
+ }
622
+ case '|':
623
+ advance();
624
+ switch (current_char()) {
625
+ case '|':
626
+ advance();
627
+ switch (current_char()) {
628
+ case '=':
629
+ advance();
630
+ return Token { Token::Type::PipePipeEqual, m_file, m_token_line, m_token_column };
631
+ default:
632
+ return Token { Token::Type::PipePipe, m_file, m_token_line, m_token_column };
633
+ }
634
+ case '=':
635
+ advance();
636
+ return Token { Token::Type::PipeEqual, m_file, m_token_line, m_token_column };
637
+ default:
638
+ return Token { Token::Type::Pipe, m_file, m_token_line, m_token_column };
639
+ }
640
+ case '^':
641
+ advance();
642
+ switch (current_char()) {
643
+ case '=':
644
+ advance();
645
+ return Token { Token::Type::CaretEqual, m_file, m_token_line, m_token_column };
646
+ default:
647
+ return Token { Token::Type::Caret, m_file, m_token_line, m_token_column };
648
+ }
649
+ case '~':
650
+ advance();
651
+ switch (current_char()) {
652
+ case '@':
653
+ if (m_last_token.is_def_keyword() || m_last_token.is_dot()) {
654
+ advance();
655
+ SharedPtr<String> lit = new String("~@");
656
+ return Token { Token::Type::BareName, lit, m_file, m_token_line, m_token_column };
657
+ } else {
658
+ return Token { Token::Type::Tilde, m_file, m_token_line, m_token_column };
659
+ }
660
+ default:
661
+ return Token { Token::Type::Tilde, m_file, m_token_line, m_token_column };
662
+ }
663
+ case '?': {
664
+ auto c = next();
665
+ if (isspace(c)) {
666
+ m_open_ternary = true;
667
+ return Token { Token::Type::TernaryQuestion, m_file, m_token_line, m_token_column };
668
+ } else {
669
+ advance();
670
+ if (c == '\\') {
671
+ auto buf = new String();
672
+ auto result = consume_escaped_byte(*buf);
673
+ if (!result.first)
674
+ return Token { result.second, current_char(), m_file, m_token_line, m_token_column };
675
+ return Token { Token::Type::String, buf, m_file, m_token_line, m_token_column };
676
+ } else {
677
+ return Token { Token::Type::String, c, m_file, m_token_line, m_token_column };
678
+ }
679
+ }
680
+ }
681
+ case ':': {
682
+ auto c = next();
683
+ if (c == ':') {
684
+ advance();
685
+ return Token { Token::Type::ConstantResolution, m_file, m_token_line, m_token_column };
686
+ } else if (m_last_token.type() == Token::Type::InterpolatedStringEnd && !m_whitespace_precedes && !m_open_ternary) {
687
+ return Token { Token::Type::InterpolatedStringSymbolKey, m_file, m_token_line, m_token_column };
688
+ } else if (c == '"') {
689
+ advance();
690
+ return consume_double_quoted_string('"', '"', Token::Type::InterpolatedSymbolBegin, Token::Type::InterpolatedSymbolEnd);
691
+ } else if (c == '\'') {
692
+ advance();
693
+ auto string = consume_single_quoted_string('\'', '\'');
694
+ return Token { Token::Type::Symbol, string.literal(), m_file, m_token_line, m_token_column };
695
+ } else if (isspace(c)) {
696
+ m_open_ternary = false;
697
+ auto token = Token { Token::Type::TernaryColon, m_file, m_token_line, m_token_column };
698
+ token.set_whitespace_precedes(m_whitespace_precedes);
699
+ return token;
700
+ } else {
701
+ return consume_symbol();
702
+ }
703
+ }
704
+ case '@':
705
+ switch (peek()) {
706
+ case '@': {
707
+ // kinda janky, but we gotta trick consume_word and then prepend the '@' back on the front
708
+ advance();
709
+ auto token = consume_word(Token::Type::ClassVariable);
710
+ token.set_literal(String::format("@{}", token.literal()));
711
+ return token;
712
+ }
713
+ default:
714
+ return consume_word(Token::Type::InstanceVariable);
715
+ }
716
+ case '$':
717
+ if (peek() == '&') {
718
+ advance(2);
719
+ return Token { Token::Type::BackRef, '&', m_file, m_token_line, m_token_column };
720
+ } else if (peek() >= '1' && peek() <= '9') {
721
+ return consume_nth_ref();
722
+ } else {
723
+ return consume_global_variable();
724
+ }
725
+ case '.':
726
+ advance();
727
+ switch (current_char()) {
728
+ case '.':
729
+ advance();
730
+ switch (current_char()) {
731
+ case '.':
732
+ advance();
733
+ return Token { Token::Type::DotDotDot, m_file, m_token_line, m_token_column };
734
+ default:
735
+ return Token { Token::Type::DotDot, m_file, m_token_line, m_token_column };
736
+ }
737
+ default:
738
+ return Token { Token::Type::Dot, m_file, m_token_line, m_token_column };
739
+ }
740
+ case '{':
741
+ advance();
742
+ return Token { Token::Type::LCurlyBrace, m_file, m_token_line, m_token_column };
743
+ case '[': {
744
+ advance();
745
+ switch (current_char()) {
746
+ case ']':
747
+ advance();
748
+ switch (current_char()) {
749
+ case '=':
750
+ advance();
751
+ return Token { Token::Type::LBracketRBracketEqual, m_file, m_token_line, m_token_column };
752
+ default:
753
+ auto token = Token { Token::Type::LBracketRBracket, m_file, m_token_line, m_token_column };
754
+ token.set_whitespace_precedes(m_whitespace_precedes);
755
+ return token;
756
+ }
757
+ default:
758
+ auto token = Token { Token::Type::LBracket, m_file, m_token_line, m_token_column };
759
+ token.set_whitespace_precedes(m_whitespace_precedes);
760
+ return token;
761
+ }
762
+ }
763
+ case '(': {
764
+ advance();
765
+ auto token = Token { Token::Type::LParen, m_file, m_token_line, m_token_column };
766
+ token.set_whitespace_precedes(m_whitespace_precedes);
767
+ return token;
768
+ }
769
+ case '}':
770
+ advance();
771
+ return Token { Token::Type::RCurlyBrace, m_file, m_token_line, m_token_column };
772
+ case ']':
773
+ advance();
774
+ return Token { Token::Type::RBracket, m_file, m_token_line, m_token_column };
775
+ case ')':
776
+ advance();
777
+ return Token { Token::Type::RParen, m_file, m_token_line, m_token_column };
778
+ case '\n': {
779
+ advance();
780
+ auto token = Token { Token::Type::Newline, m_file, m_token_line, m_token_column };
781
+ if (!m_heredoc_stack.is_empty()) {
782
+ auto new_index = m_heredoc_stack.last();
783
+ while (m_index < new_index)
784
+ advance();
785
+ m_heredoc_stack.clear();
786
+ }
787
+ return token;
788
+ }
789
+ case ';':
790
+ advance();
791
+ return Token { Token::Type::Semicolon, m_file, m_token_line, m_token_column };
792
+ case ',':
793
+ advance();
794
+ return Token { Token::Type::Comma, m_file, m_token_line, m_token_column };
795
+ case '"':
796
+ advance();
797
+ return consume_double_quoted_string('"', '"');
798
+ case '\'':
799
+ advance();
800
+ return consume_single_quoted_string('\'', '\'');
801
+ case '`': {
802
+ advance();
803
+ return consume_double_quoted_string('`', '`', Token::Type::InterpolatedShellBegin, Token::Type::InterpolatedShellEnd);
804
+ }
805
+ case '#':
806
+ if (token_is_first_on_line()) {
807
+ SharedPtr<String> doc = new String();
808
+ bool found_comment_marker = true;
809
+ char c = current_char();
810
+ while (c) {
811
+ if (!found_comment_marker) {
812
+ if (c == '#')
813
+ found_comment_marker = true;
814
+ else if (!isspace(c))
815
+ break;
816
+ }
817
+ if (c == '\n' || c == '\r') {
818
+ doc->append_char(c);
819
+ found_comment_marker = false;
820
+ } else if (found_comment_marker)
821
+ doc->append_char(c);
822
+ c = next();
823
+ }
824
+ return Token { Token::Type::Doc, doc, m_file, m_token_line, m_token_column };
825
+ } else {
826
+ char c;
827
+ do {
828
+ c = next();
829
+ } while (c && c != '\n' && c != '\r');
830
+ return Token { Token::Type::Comment, m_file, m_token_line, m_token_column };
831
+ }
832
+ case '0':
833
+ case '1':
834
+ case '2':
835
+ case '3':
836
+ case '4':
837
+ case '5':
838
+ case '6':
839
+ case '7':
840
+ case '8':
841
+ case '9': {
842
+ auto token = consume_numeric();
843
+ return token;
844
+ }
845
+ };
846
+
847
+ Token keyword_token;
848
+
849
+ if (!m_last_token.is_dot() && match(4, "self")) {
850
+ if (current_char() == '.')
851
+ keyword_token = { Token::Type::SelfKeyword, m_file, m_token_line, m_token_column };
852
+ else
853
+ rewind(4);
854
+ }
855
+
856
+ if (!m_last_token.is_dot() && !m_last_token.is_def_keyword()) {
857
+ if (match(12, "__ENCODING__"))
858
+ keyword_token = { Token::Type::ENCODINGKeyword, m_file, m_token_line, m_token_column };
859
+ else if (match(8, "__LINE__"))
860
+ keyword_token = { Token::Type::LINEKeyword, m_file, m_token_line, m_token_column };
861
+ else if (match(8, "__FILE__"))
862
+ keyword_token = { Token::Type::FILEKeyword, m_file, m_token_line, m_token_column };
863
+ else if (match(5, "BEGIN"))
864
+ keyword_token = { Token::Type::BEGINKeyword, m_file, m_token_line, m_token_column };
865
+ else if (match(3, "END"))
866
+ keyword_token = { Token::Type::ENDKeyword, m_file, m_token_line, m_token_column };
867
+ else if (match(5, "alias"))
868
+ keyword_token = { Token::Type::AliasKeyword, m_file, m_token_line, m_token_column };
869
+ else if (match(3, "and"))
870
+ keyword_token = { Token::Type::AndKeyword, m_file, m_token_line, m_token_column };
871
+ else if (match(5, "begin"))
872
+ keyword_token = { Token::Type::BeginKeyword, m_file, m_token_line, m_token_column };
873
+ else if (match(5, "break"))
874
+ keyword_token = { Token::Type::BreakKeyword, m_file, m_token_line, m_token_column };
875
+ else if (match(4, "case"))
876
+ keyword_token = { Token::Type::CaseKeyword, m_file, m_token_line, m_token_column };
877
+ else if (match(5, "class"))
878
+ keyword_token = { Token::Type::ClassKeyword, m_file, m_token_line, m_token_column };
879
+ else if (match(8, "defined?"))
880
+ keyword_token = { Token::Type::DefinedKeyword, m_file, m_token_line, m_token_column };
881
+ else if (match(3, "def"))
882
+ keyword_token = { Token::Type::DefKeyword, m_file, m_token_line, m_token_column };
883
+ else if (match(2, "do"))
884
+ keyword_token = { Token::Type::DoKeyword, m_file, m_token_line, m_token_column };
885
+ else if (match(4, "else"))
886
+ keyword_token = { Token::Type::ElseKeyword, m_file, m_token_line, m_token_column };
887
+ else if (match(5, "elsif"))
888
+ keyword_token = { Token::Type::ElsifKeyword, m_file, m_token_line, m_token_column };
889
+ else if (match(3, "end"))
890
+ keyword_token = { Token::Type::EndKeyword, m_file, m_token_line, m_token_column };
891
+ else if (match(6, "ensure"))
892
+ keyword_token = { Token::Type::EnsureKeyword, m_file, m_token_line, m_token_column };
893
+ else if (match(5, "false"))
894
+ keyword_token = { Token::Type::FalseKeyword, m_file, m_token_line, m_token_column };
895
+ else if (match(3, "for"))
896
+ keyword_token = { Token::Type::ForKeyword, m_file, m_token_line, m_token_column };
897
+ else if (match(2, "if"))
898
+ keyword_token = { Token::Type::IfKeyword, m_file, m_token_line, m_token_column };
899
+ else if (match(2, "in"))
900
+ keyword_token = { Token::Type::InKeyword, m_file, m_token_line, m_token_column };
901
+ else if (match(6, "module"))
902
+ keyword_token = { Token::Type::ModuleKeyword, m_file, m_token_line, m_token_column };
903
+ else if (match(4, "next"))
904
+ keyword_token = { Token::Type::NextKeyword, m_file, m_token_line, m_token_column };
905
+ else if (match(3, "nil"))
906
+ keyword_token = { Token::Type::NilKeyword, m_file, m_token_line, m_token_column };
907
+ else if (match(3, "not"))
908
+ keyword_token = { Token::Type::NotKeyword, m_file, m_token_line, m_token_column };
909
+ else if (match(2, "or"))
910
+ keyword_token = { Token::Type::OrKeyword, m_file, m_token_line, m_token_column };
911
+ else if (match(4, "redo"))
912
+ keyword_token = { Token::Type::RedoKeyword, m_file, m_token_line, m_token_column };
913
+ else if (match(6, "rescue"))
914
+ keyword_token = { Token::Type::RescueKeyword, m_file, m_token_line, m_token_column };
915
+ else if (match(5, "retry"))
916
+ keyword_token = { Token::Type::RetryKeyword, m_file, m_token_line, m_token_column };
917
+ else if (match(6, "return"))
918
+ keyword_token = { Token::Type::ReturnKeyword, m_file, m_token_line, m_token_column };
919
+ else if (match(4, "self"))
920
+ keyword_token = { Token::Type::SelfKeyword, m_file, m_token_line, m_token_column };
921
+ else if (match(5, "super"))
922
+ keyword_token = { Token::Type::SuperKeyword, m_file, m_token_line, m_token_column };
923
+ else if (match(4, "then"))
924
+ keyword_token = { Token::Type::ThenKeyword, m_file, m_token_line, m_token_column };
925
+ else if (match(4, "true"))
926
+ keyword_token = { Token::Type::TrueKeyword, m_file, m_token_line, m_token_column };
927
+ else if (match(5, "undef"))
928
+ keyword_token = { Token::Type::UndefKeyword, m_file, m_token_line, m_token_column };
929
+ else if (match(6, "unless"))
930
+ keyword_token = { Token::Type::UnlessKeyword, m_file, m_token_line, m_token_column };
931
+ else if (match(5, "until"))
932
+ keyword_token = { Token::Type::UntilKeyword, m_file, m_token_line, m_token_column };
933
+ else if (match(4, "when"))
934
+ keyword_token = { Token::Type::WhenKeyword, m_file, m_token_line, m_token_column };
935
+ else if (match(5, "while"))
936
+ keyword_token = { Token::Type::WhileKeyword, m_file, m_token_line, m_token_column };
937
+ else if (match(5, "yield"))
938
+ keyword_token = { Token::Type::YieldKeyword, m_file, m_token_line, m_token_column };
939
+ }
940
+
941
+ // if a colon comes next, it's not a keyword -- it's a symbol!
942
+ if (keyword_token && current_char() == ':' && peek() != ':' && !m_open_ternary) {
943
+ advance(); // :
944
+ auto name = keyword_token.type_value();
945
+ return Token { Token::Type::SymbolKey, name, m_file, m_token_line, m_token_column };
946
+ } else if (keyword_token) {
947
+ return keyword_token;
948
+ }
949
+
950
+ auto c = current_char();
951
+ if ((c >= 'a' && c <= 'z') || c == '_') {
952
+ return consume_bare_name();
953
+ } else if (c >= 'A' && c <= 'Z') {
954
+ return consume_constant();
955
+ } else {
956
+ auto buf = consume_non_whitespace();
957
+ auto token = Token { Token::Type::Invalid, buf, m_file, m_token_line, m_token_column };
958
+ return token;
959
+ }
960
+
961
+ TM_UNREACHABLE();
962
+ }
963
+
964
// Lexes the body of a symbol literal after the leading ':' has been consumed,
// e.g. :foo, :@ivar, :@@cvar, :$glob, :+, :[]=, :<=>, :==, :foo=.
// Accumulates the symbol text into `buf`; operator symbols are matched
// explicitly, anything else is treated as an identifier-like name.
Token Lexer::consume_symbol() {
    char c = current_char();
    SharedPtr<String> buf = new String("");
    // append the current char to the buffer and return the next input char
    auto gobble = [&buf, this](char c) -> char { buf->append_char(c); return next(); };
    switch (c) {
    case '@':
        // instance (@foo) or class (@@foo) variable symbol
        c = gobble(c);
        if (c == '@') c = gobble(c);
        do {
            c = gobble(c);
        } while (is_identifier_char(c));
        break;
    case '$':
        // global variable symbol
        c = gobble(c);
        do {
            c = gobble(c);
        } while (is_identifier_char(c));
        break;
    case '~':
        c = gobble(c);
        // NOTE(review): a trailing '@' is consumed but NOT appended, so :~@
        // normalizes to :~ — unlike the '+'/'-' cases below which keep the '@'.
        // Presumably intentional normalization; confirm against MRI behavior.
        if (c == '@') advance();
        break;
    case '+':
    case '-': {
        // :+ / :- and the unary method symbols :+@ / :-@
        c = gobble(c);
        if (c == '@') gobble(c);
        break;
    }
    case '&':
    case '|':
    case '^':
    case '%':
    case '/': {
        // single-character operator symbols
        gobble(c);
        break;
    }
    case '*':
        // :* or :**
        c = gobble(c);
        if (c == '*')
            gobble(c);
        break;
    case '=':
        // valid only as :== / :=== / :=~ — bare := is not a symbol
        switch (peek()) {
        case '=':
            c = gobble(c);
            c = gobble(c);
            if (c == '=') gobble(c);
            break;
        case '~':
            c = gobble(c);
            gobble(c);
            break;
        default:
            return Token { Token::Type::Invalid, c, m_file, m_token_line, m_token_column };
        }
        break;
    case '!':
        // :! and its variants :!= :!~ :!@ (falls through to break otherwise)
        c = gobble(c);
        switch (c) {
        case '=':
        case '~':
        case '@':
            gobble(c);
        default:
            break;
        }
        break;
    case '>':
        // :> :>= :>>
        c = gobble(c);
        switch (c) {
        case '=':
        case '>':
            gobble(c);
        default:
            break;
        }
        break;
    case '<':
        // :< :<= :<=> :<<
        c = gobble(c);
        switch (c) {
        case '=':
            c = gobble(c);
            if (c == '>') gobble(c);
            break;
        case '<':
            gobble(c);
        default:
            break;
        }
        break;
    case '[':
        // element access symbols :[] and :[]= ; a lone '[' is invalid
        if (peek() == ']') {
            c = gobble(c);
            c = gobble(c);
            if (c == '=') gobble(c);
        } else {
            return Token { Token::Type::Invalid, c, m_file, m_token_line, m_token_column };
        }
        break;
    default:
        // plain name symbol; may end in ? ! or = (setter), but `=>` after the
        // name belongs to a hash rocket, so don't gobble '=' when '>' follows
        do {
            c = gobble(c);
        } while (is_identifier_char(c));
        switch (c) {
        case '?':
        case '!':
        case '=':
            switch (peek()) {
            case '>':
                break;
            default:
                gobble(c);
            }
        default:
            break;
        }
    }
    return Token { Token::Type::Symbol, buf, m_file, m_token_line, m_token_column };
}
1083
+
1084
+ Token Lexer::consume_word(Token::Type type) {
1085
+ char c = current_char();
1086
+ SharedPtr<String> buf = new String("");
1087
+ do {
1088
+ buf->append_char(c);
1089
+ c = next();
1090
+ } while (is_identifier_char(c));
1091
+ switch (c) {
1092
+ case '?':
1093
+ case '!':
1094
+ advance();
1095
+ buf->append_char(c);
1096
+ break;
1097
+ default:
1098
+ break;
1099
+ }
1100
+ return Token { type, buf, m_file, m_token_line, m_token_column };
1101
+ }
1102
+
1103
+ Token Lexer::consume_bare_name() {
1104
+ auto token = consume_word(Token::Type::BareName);
1105
+ auto c = current_char();
1106
+ if (c == ':' && peek() != ':' && m_last_token.can_precede_symbol_key()) {
1107
+ advance();
1108
+ token.set_type(Token::Type::SymbolKey);
1109
+ }
1110
+ return token;
1111
+ }
1112
+
1113
+ Token Lexer::consume_constant() {
1114
+ auto token = consume_word(Token::Type::Constant);
1115
+ auto c = current_char();
1116
+ if (c == ':' && peek() != ':' && m_last_token.can_precede_symbol_key()) {
1117
+ advance();
1118
+ token.set_type(Token::Type::SymbolKey);
1119
+ }
1120
+ return token;
1121
+ }
1122
+
1123
// Lexes a global variable after the leading '$'.
// Special one-character globals ($?, $!, $&, $~, $:, ...) are matched
// explicitly; $-X (e.g. $-i) is a two-character form; anything else is a
// regular word-shaped global like $stdout.
Token Lexer::consume_global_variable() {
    switch (peek()) {
    case '?':
    case '!':
    case '=':
    case '@':
    case '&':
    case '`':
    case '\'':
    case '"':
    case '+':
    case '/':
    case '\\':
    case ';':
    case '<':
    case '>':
    case '$':
    case '*':
    case '.':
    case ',':
    case ':':
    case '_':
    case '~': {
        // one special character: token literal is "$" + that character
        advance();
        SharedPtr<String> buf = new String("$");
        buf->append_char(current_char());
        advance();
        return Token { Token::Type::GlobalVariable, buf, m_file, m_token_line, m_token_column };
    }
    case '-': {
        // "$-X" family: the char after the dash completes the name
        SharedPtr<String> buf = new String("$-");
        advance(2);
        buf->append_char(current_char());
        advance();
        return Token { Token::Type::GlobalVariable, buf, m_file, m_token_line, m_token_column };
    }
    default: {
        // ordinary named global, e.g. $stderr
        return consume_word(Token::Type::GlobalVariable);
    }
    }
}
1164
+
1165
+ bool is_valid_heredoc(bool with_dash, SharedPtr<String> doc, String heredoc_name) {
1166
+ if (!doc->ends_with(heredoc_name))
1167
+ return false;
1168
+ if (doc->length() - heredoc_name.length() == 0)
1169
+ return true;
1170
+ auto prefix = (*doc)[doc->length() - heredoc_name.length() - 1];
1171
+ return with_dash ? isspace(prefix) : prefix == '\n';
1172
+ }
1173
+
1174
// Computes the smallest leading-whitespace width across the non-blank,
// newline-terminated lines of `doc` — the amount <<~ should strip.
// Blank lines (whitespace only) are ignored. Returns 0 for an empty doc and
// SIZE_MAX when no non-blank line contributes (dedent_heredoc relies on that
// sentinel to effectively blank out an all-whitespace doc).
size_t get_heredoc_indent(SharedPtr<String> doc) {
    if (doc->is_empty())
        return 0;
    size_t heredoc_indent = std::numeric_limits<size_t>::max();
    size_t line_indent = 0;
    bool maybe_blank_line = true; // still only whitespace seen on this line
    for (size_t i = 0; i < doc->length(); i++) {
        char c = (*doc)[i];
        if (c == '\n') {
            // only non-blank lines vote on the minimum indent
            if (!maybe_blank_line && line_indent < heredoc_indent)
                heredoc_indent = line_indent;
            line_indent = 0;
            maybe_blank_line = true;
        } else if (isspace(c)) {
            // count leading whitespace; whitespace after content is ignored
            if (maybe_blank_line)
                line_indent++;
        } else {
            maybe_blank_line = false;
        }
    }
    return heredoc_indent;
}
1196
+
1197
// Strips the common leading indentation (per get_heredoc_indent) from every
// line of a <<~ heredoc body, replacing `doc` with the dedented copy.
// Assumes the doc's last line is newline-terminated (consume_heredoc
// guarantees this); lines shorter than the indent (blank lines) collapse to a
// bare '\n' because line_begin overshoots i.
void dedent_heredoc(SharedPtr<String> &doc) {
    size_t heredoc_indent = get_heredoc_indent(doc);
    if (heredoc_indent == 0)
        return;
    SharedPtr<String> new_doc = new String("");
    size_t line_begin = 0;
    for (size_t i = 0; i < doc->length(); i++) {
        char c = (*doc)[i];
        if (c == '\n') {
            // skip the indent, then copy the remainder of the line (if any)
            line_begin += heredoc_indent;
            if (line_begin < i)
                new_doc->append(doc->substring(line_begin, i - line_begin));
            new_doc->append_char('\n');
            line_begin = i + 1;
        }
    }
    doc = new_doc;
}
1215
+
1216
// Lexes a heredoc after `<<` has been consumed. Handles the `<<-` (indentable
// terminator) and `<<~` (indentable + dedented body) modifiers, bare and
// quoted terminators ("..." and `...` interpolate, '...' does not), and
// stacking multiple heredocs started on the same line. Returns either a plain
// String token or an Interpolated*Begin token with a nested lexer attached.
Token Lexer::consume_heredoc() {
    bool with_dash = false;
    bool should_dedent = false;
    switch (current_char()) {
    case '-':
        advance();
        with_dash = true;
        break;
    case '~':
        advance();
        with_dash = true;
        should_dedent = true;
        break;
    }

    // determine terminator spelling and whether the body interpolates
    auto begin_type = Token::Type::InterpolatedStringBegin;
    auto end_type = Token::Type::InterpolatedStringEnd;
    bool should_interpolate = true;
    char delimiter = 0;
    String heredoc_name = "";
    switch (current_char()) {
    case '"':
        delimiter = '"';
        break;
    case '`':
        // <<`CMD` produces a shell (backtick) interpolated string
        begin_type = Token::Type::InterpolatedShellBegin;
        end_type = Token::Type::InterpolatedShellEnd;
        delimiter = '`';
        break;
    case '\'':
        // single-quoted terminator disables interpolation
        should_interpolate = false;
        delimiter = '\'';
        break;
    default:
        delimiter = 0;
    }

    if (delimiter) {
        // quoted terminator: read chars up to the closing quote; a newline or
        // EOF before the close is an error
        char c = next();
        while (c != delimiter) {
            switch (c) {
            case '\n':
            case '\r':
            case 0:
                return Token { Token::Type::UnterminatedString, "heredoc identifier", m_file, m_token_line, m_token_column };
            default:
                heredoc_name.append_char(c);
                c = next();
            }
        }
        advance();
    } else {
        // bare terminator: a plain word
        heredoc_name = String(consume_word(Token::Type::BareName).literal());
    }

    SharedPtr<String> doc = new String("");
    // read ahead with a private index; the main cursor stays on this line
    size_t heredoc_index = m_index;
    auto get_char = [&heredoc_index, this]() { return (heredoc_index >= m_size) ? 0 : m_input->at(heredoc_index); };

    if (m_heredoc_stack.is_empty()) {
        // start consuming the heredoc on the next line
        while (get_char() != '\n') {
            if (heredoc_index >= m_size)
                return Token { Token::Type::UnterminatedString, "heredoc", m_file, m_token_line, m_token_column };
            heredoc_index++;
        }
        heredoc_index++;
    } else {
        // start consuming the heredoc right after the last one
        heredoc_index = m_heredoc_stack.last();
    }

    // consume the heredoc until we find the delimiter, either '\n' (if << was used) or any whitespace (if <<- was used) followed by "DELIM\n"
    for (;;) {
        if (heredoc_index >= m_size) {
            if (is_valid_heredoc(with_dash, doc, heredoc_name))
                break;
            return Token { Token::Type::UnterminatedString, doc, m_file, m_token_line, m_token_column };
        }
        char c = get_char();
        heredoc_index++;
        if (c == '\n' && is_valid_heredoc(with_dash, doc, heredoc_name))
            break;
        doc->append_char(c);
    }

    // chop the delimiter and any trailing space off the string
    doc->truncate(doc->length() - heredoc_name.length());
    doc->strip_trailing_spaces();

    if (should_dedent)
        dedent_heredoc(doc);

    // We have to keep tokenizing on the line where the heredoc was started, and then jump to the line after the heredoc.
    // This index is used to jump to the end of the heredoc later.
    m_heredoc_stack.push(heredoc_index);

    auto token = Token { Token::Type::String, doc, m_file, m_token_line, m_token_column };

    if (should_interpolate) {
        // hand the body to a nested lexer that emits the interpolation parts
        m_nested_lexer = new InterpolatedStringLexer { *this, token, end_type };
        return Token { begin_type, m_file, m_token_line, m_token_column };
    }

    return token;
}
1322
+
1323
// Lexes an integer or float literal starting at the current digit.
// Supports the radix prefixes 0d/0D (decimal), 0o/0O (octal), 0x/0X (hex) and
// 0b/0B (binary), with '_' allowed as a digit separator everywhere. A plain
// digit run becomes a Float when followed by ".<digit>" or an e/E exponent,
// otherwise a Fixnum/Bignum via chars_to_fixnum_or_bignum_token.
Token Lexer::consume_numeric() {
    SharedPtr<String> chars = new String;
    if (current_char() == '0') {
        switch (peek()) {
        case 'd':
        case 'D': {
            // explicit decimal: prefix is dropped from the stored literal
            advance();
            char c = next();
            if (!isdigit(c))
                return Token { Token::Type::Invalid, c, m_file, m_cursor_line, m_cursor_column };
            do {
                chars->append_char(c);
                c = next();
                if (c == '_')
                    c = next();
            } while (isdigit(c));
            return chars_to_fixnum_or_bignum_token(chars, 10, 0);
        }
        case 'o':
        case 'O': {
            // octal: literal text keeps a normalized "0o" prefix
            chars->append_char('0');
            chars->append_char('o');
            advance();
            char c = next();
            if (!(c >= '0' && c <= '7'))
                return Token { Token::Type::Invalid, c, m_file, m_cursor_line, m_cursor_column };
            do {
                chars->append_char(c);
                c = next();
                if (c == '_')
                    c = next();
            } while (c >= '0' && c <= '7');
            return chars_to_fixnum_or_bignum_token(chars, 8, 2);
        }
        case 'x':
        case 'X': {
            // hexadecimal: literal text keeps a normalized "0x" prefix
            chars->append_char('0');
            chars->append_char('x');
            advance();
            char c = next();
            if (!isxdigit(c))
                return Token { Token::Type::Invalid, c, m_file, m_cursor_line, m_cursor_column };
            do {
                chars->append_char(c);
                c = next();
                if (c == '_')
                    c = next();
            } while (isxdigit(c));
            return chars_to_fixnum_or_bignum_token(chars, 16, 2);
        }
        case 'b':
        case 'B': {
            // binary: literal text keeps a normalized "0b" prefix
            chars->append_char('0');
            chars->append_char('b');
            advance();
            char c = next();
            if (c != '0' && c != '1')
                return Token { Token::Type::Invalid, c, m_file, m_cursor_line, m_cursor_column };
            do {
                chars->append_char(c);
                c = next();
                if (c == '_')
                    c = next();
            } while (c == '0' || c == '1');
            return chars_to_fixnum_or_bignum_token(chars, 2, 2);
        }
        }
    }
    // plain decimal run (possibly the start of a float)
    char c = current_char();
    do {
        chars->append_char(c);
        c = next();
        if (c == '_')
            c = next();
    } while (isdigit(c));
    // ".5" only counts as a fraction when a digit follows the dot
    // (so `1.times` still lexes as Fixnum + Dot)
    if ((c == '.' && isdigit(peek())) || (c == 'e' || c == 'E'))
        return consume_numeric_as_float(chars);
    else
        return chars_to_fixnum_or_bignum_token(chars, 10, 0);
}
1403
+
1404
// Largest value representable as a tagged Fixnum (63 bits, matching MRI);
// anything above is lexed as a Bignum.
const long long max_fixnum = std::numeric_limits<long long>::max() / 2; // 63 bits for MRI

// Converts the collected digit string to a Fixnum token, or a Bignum token
// (carrying the raw digits) when the value overflows strtoll (errno set) or
// exceeds max_fixnum. `offset` skips a radix prefix like "0x" in `chars`.
Token Lexer::chars_to_fixnum_or_bignum_token(SharedPtr<String> chars, int base, int offset) {
    errno = 0; // strtoll reports overflow via errno (ERANGE)
    auto fixnum = strtoll(chars->c_str() + offset, nullptr, base);
    if (errno != 0 || fixnum > max_fixnum)
        return Token { Token::Type::Bignum, chars, m_file, m_token_line, m_token_column };
    else
        return Token { Token::Type::Fixnum, fixnum, m_file, m_token_line, m_token_column };
}
1414
+
1415
// Continues lexing a numeric literal as a float once consume_numeric has seen
// ".<digit>" or an exponent. `chars` already holds the integer part; this
// appends the fraction and/or exponent (with '_' separators allowed) and
// converts with atof. An exponent with no digits is an Invalid token.
Token Lexer::consume_numeric_as_float(SharedPtr<String> chars) {
    char c = current_char();
    if (c == '.') {
        // fractional part
        chars->append_char('.');
        c = next();
        do {
            chars->append_char(c);
            c = next();
            if (c == '_')
                c = next();
        } while (isdigit(c));
    }
    if (c == 'e' || c == 'E') {
        // exponent, normalized to lowercase 'e'; optional sign
        chars->append_char('e');
        c = next();
        if (c == '-' || c == '+') {
            chars->append_char(c);
            c = next();
        }
        if (!isdigit(c))
            return Token { Token::Type::Invalid, c, m_file, m_cursor_line, m_cursor_column };
        do {
            chars->append_char(c);
            c = next();
            if (c == '_')
                c = next();
        } while (isdigit(c));
    }
    double dbl = atof(chars->c_str());
    return Token { Token::Type::Float, dbl, m_file, m_token_line, m_token_column };
}
1446
+
1447
+ Token Lexer::consume_nth_ref() {
1448
+ char c = next();
1449
+ long long num = 0;
1450
+ do {
1451
+ num *= 10;
1452
+ num += c - '0';
1453
+ c = next();
1454
+ } while (isdigit(c));
1455
+ return Token { Token::Type::NthRef, num, m_file, m_token_line, m_token_column };
1456
+ }
1457
+
1458
// Reads hex digits starting at the current char and returns their value.
// `max_length` caps the digit count (0 = unlimited); `allow_underscore`
// permits '_' separators between digits. Assumes the current char is a valid
// hex digit. Note the loop guard: `++length < max_length` counts digits
// *after* the first, so exactly max_length digits are consumed at most.
long long Lexer::consume_hex_number(int max_length, bool allow_underscore) {
    char c = current_char();
    int length = 0;
    long long number = 0;
    do {
        number *= 16;
        if (c >= 'a' && c <= 'f')
            number += c - 'a' + 10;
        else if (c >= 'A' && c <= 'F')
            number += c - 'A' + 10;
        else
            number += c - '0';
        c = next();
        if (allow_underscore && c == '_')
            c = next();
    } while (isxdigit(c) && (max_length == 0 || ++length < max_length));
    return number;
}
1476
+
1477
// Reads octal digits starting at the current char and returns their value.
// `max_length` caps the digit count (0 = unlimited); `allow_underscore`
// permits '_' separators. Assumes the current char is a valid octal digit.
// Same post-increment length guard as consume_hex_number.
long long Lexer::consume_octal_number(int max_length, bool allow_underscore) {
    char c = current_char();
    int length = 0;
    long long number = 0;
    do {
        number *= 8;
        number += c - '0';
        c = next();
        if (allow_underscore && c == '_')
            c = next();
    } while (c >= '0' && c <= '7' && (max_length == 0 || ++length < max_length));
    return number;
}
1490
+
1491
// public domain
// https://gist.github.com/Miouyouyou/864130e8734afe3f806512b14022226f
// Appends the UTF-8 encoding (1-4 bytes) of `codepoint` to `buf`.
// Callers must pass codepoints below 0x200000; larger values hit
// TM_UNREACHABLE. NOTE(review): surrogate codepoints (0xD800-0xDFFF) and
// values above U+10FFFF are not rejected here — assumed validated upstream.
void Lexer::utf32_codepoint_to_utf8(String &buf, long long codepoint) {
    if (codepoint < 0x80) {
        // ASCII: single byte
        buf.append_char(codepoint);
    } else if (codepoint < 0x800) { // 00000yyy yyxxxxxx
        buf.append_char(0b11000000 | (codepoint >> 6));
        buf.append_char(0b10000000 | (codepoint & 0x3f));
    } else if (codepoint < 0x10000) { // zzzzyyyy yyxxxxxx
        buf.append_char(0b11100000 | (codepoint >> 12));
        buf.append_char(0b10000000 | ((codepoint >> 6) & 0x3f));
        buf.append_char(0b10000000 | (codepoint & 0x3f));
    } else if (codepoint < 0x200000) { // 000uuuuu zzzzyyyy yyxxxxxx
        buf.append_char(0b11110000 | (codepoint >> 18));
        buf.append_char(0b10000000 | ((codepoint >> 12) & 0x3f));
        buf.append_char(0b10000000 | ((codepoint >> 6) & 0x3f));
        buf.append_char(0b10000000 | (codepoint & 0x3f));
    } else {
        TM_UNREACHABLE();
    }
}
1512
+
1513
// Consumes one escape sequence (everything after the backslash) inside an
// interpolating string and appends the resulting byte(s) to `buf`.
// Returns { ok, type }: on failure `type` names the kind of bad escape
// (InvalidUnicodeEscape / InvalidCharacterEscape) for the caller's error token.
std::pair<bool, Token::Type> Lexer::consume_escaped_byte(String &buf) {
    // Handles the X in \cX / \C-X control escapes; `meta` is true when we're
    // already inside a \M- sequence, which adds 128 to the result. Returns -1
    // for a malformed \c\M sequence.
    auto control_character = [&](bool meta) {
        char c = next();
        if (c == '-')
            c = next();
        int num = 0;
        // combined control+meta written as \c\M-X
        if (!meta && c == '\\' && peek() == 'M') {
            advance(); // M
            c = next();
            if (c != '-')
                return -1;
            meta = true;
            c = next();
        }
        if (c == '?')
            num = 127;
        else if (c >= ' ' && c <= '>')
            num = c - ' ';
        else if (c >= '@' && c <= '_')
            num = c - '@';
        else if (c >= '`' && c <= '~')
            num = c - '`';
        if (meta)
            return num + 128;
        else
            return num;
    };
    auto c = current_char();
    if (c >= '0' && c <= '7') {
        // octal: up to 3 digits
        auto number = consume_octal_number(3);
        buf.append_char(number);
    } else if (c == 'x') {
        // hex: 1-2 digits
        advance();
        auto number = consume_hex_number(2);
        buf.append_char(number);
    } else if (c == 'u') {
        c = next();
        if (c == '{') {
            c = next();
            // unicode characters, space separated, 1-6 hex digits
            while (c != '}') {
                if (!isxdigit(c))
                    return { false, Token::Type::InvalidUnicodeEscape };
                auto codepoint = consume_hex_number(6);
                utf32_codepoint_to_utf8(buf, codepoint);
                c = current_char();
                while (c == ' ')
                    c = next();
            }
            if (c == '}')
                advance();
        } else {
            // unicode: 4 hex digits
            auto codepoint = consume_hex_number(4);
            utf32_codepoint_to_utf8(buf, codepoint);
        }
    } else {
        // single-character escapes
        switch (c) {
        case 'a':
            buf.append_char('\a');
            break;
        case 'b':
            buf.append_char('\b');
            break;
        case 'c':
        case 'C': {
            // control escape \cX or \C-X
            int num = control_character(false);
            if (num == -1)
                return { false, Token::Type::InvalidCharacterEscape };
            buf.append_char((unsigned char)num);
            break;
        }
        case 'e':
            buf.append_char('\e');
            break;
        case 'f':
            buf.append_char('\f');
            break;
        case 'M': {
            // meta escape \M-X, optionally combined with \M-\cX
            c = next();
            if (c != '-')
                return { false, Token::Type::InvalidCharacterEscape };
            c = next();
            int num = 0;
            if (c == '\\' && (peek() == 'c' || peek() == 'C')) {
                advance();
                num = control_character(true);
            } else {
                num = (int)c + 128;
            }
            buf.append_char((unsigned char)num);
            break;
        }
        case 'n':
            buf.append_char('\n');
            break;
        case 'r':
            buf.append_char('\r');
            break;
        case 's':
            // \s is a literal space
            buf.append_char((unsigned char)32);
            break;
        case 't':
            buf.append_char('\t');
            break;
        case 'v':
            buf.append_char('\v');
            break;
        case '\n':
            // escaped newline: line continuation, emits nothing
            break;
        default:
            // unknown escape: keep the character as-is
            buf.append_char(c);
            break;
        }
        advance();
    }
    return { true, Token::Type::String };
}
1632
+
1633
+ bool Lexer::token_is_first_on_line() const {
1634
+ return !m_last_token || m_last_token.is_newline();
1635
+ }
1636
+
1637
+ Token Lexer::consume_double_quoted_string(char start_char, char stop_char, Token::Type begin_type, Token::Type end_type) {
1638
+ m_nested_lexer = new InterpolatedStringLexer { *this, start_char, stop_char, end_type };
1639
+ return Token { begin_type, start_char, m_file, m_token_line, m_token_column };
1640
+ }
1641
+
1642
// Lexes a non-interpolating string, e.g. 'foo' or %q(foo), after the opening
// delimiter has been consumed. For bracket-style delimiters (start != stop),
// nested balanced pairs are kept literally via pair_depth. Only `\\` and an
// escaped stop char collapse; every other backslash sequence is preserved
// verbatim. A trailing ':' (outside a ternary) turns the result into a
// SymbolKey, e.g. `'foo': 1`.
Token Lexer::consume_single_quoted_string(char start_char, char stop_char) {
    int pair_depth = 0;
    SharedPtr<String> buf = new String("");
    char c = current_char();
    while (c) {
        if (c == '\\') {
            c = next();
            if (c == stop_char || c == '\\') {
                buf->append_char(c);
            } else {
                // not an escapable char: keep the backslash itself
                buf->append_char('\\');
                buf->append_char(c);
            }
        } else if (c == start_char && start_char != stop_char) {
            // nested opening delimiter, e.g. '(' inside %q(...)
            pair_depth++;
            buf->append_char(c);
        } else if (c == stop_char) {
            if (pair_depth > 0) {
                pair_depth--;
                buf->append_char(c);
            } else {
                advance(); // '
                if (current_char() == ':' && !m_open_ternary) {
                    advance(); // :
                    return Token { Token::Type::SymbolKey, buf, m_file, m_token_line, m_token_column };
                } else {
                    return Token { Token::Type::String, buf, m_file, m_token_line, m_token_column };
                }
            }
        } else {
            buf->append_char(c);
        }
        c = next();
    }
    // hit EOF before the closing delimiter
    return Token { Token::Type::UnterminatedString, start_char, m_file, m_token_line, m_token_column };
}
1678
+
1679
+ Token Lexer::consume_quoted_array_without_interpolation(char start_char, char stop_char, Token::Type type) {
1680
+ m_nested_lexer = new WordArrayLexer { *this, start_char, stop_char, false };
1681
+ return Token { type, start_char, m_file, m_token_line, m_token_column };
1682
+ }
1683
+
1684
+ Token Lexer::consume_quoted_array_with_interpolation(char start_char, char stop_char, Token::Type type) {
1685
+ m_nested_lexer = new WordArrayLexer { *this, start_char, stop_char, true };
1686
+ return Token { type, start_char, m_file, m_token_line, m_token_column };
1687
+ }
1688
+
1689
+ Token Lexer::consume_regexp(char start_char, char stop_char) {
1690
+ m_nested_lexer = new RegexpLexer { *this, start_char, stop_char };
1691
+ return Token { Token::Type::InterpolatedRegexpBegin, start_char, m_file, m_token_line, m_token_column };
1692
+ }
1693
+
1694
+ SharedPtr<String> Lexer::consume_non_whitespace() {
1695
+ char c = current_char();
1696
+ SharedPtr<String> buf = new String("");
1697
+ do {
1698
+ buf->append_char(c);
1699
+ c = next();
1700
+ } while (c && c != ' ' && c != '\t' && c != '\n' && c != '\r');
1701
+ return buf;
1702
+ }
1703
+ };