jruby-prism-parser 0.23.0.pre.SNAPSHOT-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (110) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +401 -0
  3. data/CODE_OF_CONDUCT.md +76 -0
  4. data/CONTRIBUTING.md +62 -0
  5. data/LICENSE.md +7 -0
  6. data/Makefile +101 -0
  7. data/README.md +98 -0
  8. data/config.yml +2902 -0
  9. data/docs/build_system.md +91 -0
  10. data/docs/configuration.md +64 -0
  11. data/docs/cruby_compilation.md +27 -0
  12. data/docs/design.md +53 -0
  13. data/docs/encoding.md +121 -0
  14. data/docs/fuzzing.md +88 -0
  15. data/docs/heredocs.md +36 -0
  16. data/docs/javascript.md +118 -0
  17. data/docs/local_variable_depth.md +229 -0
  18. data/docs/mapping.md +117 -0
  19. data/docs/parser_translation.md +34 -0
  20. data/docs/parsing_rules.md +19 -0
  21. data/docs/releasing.md +98 -0
  22. data/docs/ripper.md +36 -0
  23. data/docs/ruby_api.md +43 -0
  24. data/docs/ruby_parser_translation.md +19 -0
  25. data/docs/serialization.md +209 -0
  26. data/docs/testing.md +55 -0
  27. data/ext/prism/api_node.c +5098 -0
  28. data/ext/prism/api_pack.c +267 -0
  29. data/ext/prism/extconf.rb +110 -0
  30. data/ext/prism/extension.c +1155 -0
  31. data/ext/prism/extension.h +18 -0
  32. data/include/prism/ast.h +5807 -0
  33. data/include/prism/defines.h +102 -0
  34. data/include/prism/diagnostic.h +339 -0
  35. data/include/prism/encoding.h +265 -0
  36. data/include/prism/node.h +57 -0
  37. data/include/prism/options.h +230 -0
  38. data/include/prism/pack.h +152 -0
  39. data/include/prism/parser.h +732 -0
  40. data/include/prism/prettyprint.h +26 -0
  41. data/include/prism/regexp.h +33 -0
  42. data/include/prism/util/pm_buffer.h +155 -0
  43. data/include/prism/util/pm_char.h +205 -0
  44. data/include/prism/util/pm_constant_pool.h +209 -0
  45. data/include/prism/util/pm_list.h +97 -0
  46. data/include/prism/util/pm_memchr.h +29 -0
  47. data/include/prism/util/pm_newline_list.h +93 -0
  48. data/include/prism/util/pm_state_stack.h +42 -0
  49. data/include/prism/util/pm_string.h +150 -0
  50. data/include/prism/util/pm_string_list.h +44 -0
  51. data/include/prism/util/pm_strncasecmp.h +32 -0
  52. data/include/prism/util/pm_strpbrk.h +46 -0
  53. data/include/prism/version.h +29 -0
  54. data/include/prism.h +289 -0
  55. data/jruby-prism.jar +0 -0
  56. data/lib/prism/compiler.rb +486 -0
  57. data/lib/prism/debug.rb +206 -0
  58. data/lib/prism/desugar_compiler.rb +207 -0
  59. data/lib/prism/dispatcher.rb +2150 -0
  60. data/lib/prism/dot_visitor.rb +4634 -0
  61. data/lib/prism/dsl.rb +785 -0
  62. data/lib/prism/ffi.rb +346 -0
  63. data/lib/prism/lex_compat.rb +908 -0
  64. data/lib/prism/mutation_compiler.rb +753 -0
  65. data/lib/prism/node.rb +17864 -0
  66. data/lib/prism/node_ext.rb +212 -0
  67. data/lib/prism/node_inspector.rb +68 -0
  68. data/lib/prism/pack.rb +224 -0
  69. data/lib/prism/parse_result/comments.rb +177 -0
  70. data/lib/prism/parse_result/newlines.rb +64 -0
  71. data/lib/prism/parse_result.rb +498 -0
  72. data/lib/prism/pattern.rb +250 -0
  73. data/lib/prism/serialize.rb +1354 -0
  74. data/lib/prism/translation/parser/compiler.rb +1838 -0
  75. data/lib/prism/translation/parser/lexer.rb +335 -0
  76. data/lib/prism/translation/parser/rubocop.rb +37 -0
  77. data/lib/prism/translation/parser.rb +178 -0
  78. data/lib/prism/translation/ripper.rb +577 -0
  79. data/lib/prism/translation/ruby_parser.rb +1521 -0
  80. data/lib/prism/translation.rb +11 -0
  81. data/lib/prism/version.rb +3 -0
  82. data/lib/prism/visitor.rb +495 -0
  83. data/lib/prism.rb +99 -0
  84. data/prism.gemspec +135 -0
  85. data/rbi/prism.rbi +7767 -0
  86. data/rbi/prism_static.rbi +207 -0
  87. data/sig/prism.rbs +4773 -0
  88. data/sig/prism_static.rbs +201 -0
  89. data/src/diagnostic.c +400 -0
  90. data/src/encoding.c +5132 -0
  91. data/src/node.c +2786 -0
  92. data/src/options.c +213 -0
  93. data/src/pack.c +493 -0
  94. data/src/prettyprint.c +8881 -0
  95. data/src/prism.c +18406 -0
  96. data/src/regexp.c +638 -0
  97. data/src/serialize.c +1554 -0
  98. data/src/token_type.c +700 -0
  99. data/src/util/pm_buffer.c +190 -0
  100. data/src/util/pm_char.c +318 -0
  101. data/src/util/pm_constant_pool.c +322 -0
  102. data/src/util/pm_list.c +49 -0
  103. data/src/util/pm_memchr.c +35 -0
  104. data/src/util/pm_newline_list.c +84 -0
  105. data/src/util/pm_state_stack.c +25 -0
  106. data/src/util/pm_string.c +203 -0
  107. data/src/util/pm_string_list.c +28 -0
  108. data/src/util/pm_strncasecmp.c +24 -0
  109. data/src/util/pm_strpbrk.c +180 -0
  110. metadata +156 -0
@@ -0,0 +1,732 @@
1
+ /**
2
+ * @file parser.h
3
+ *
4
+ * The parser used to parse Ruby source.
5
+ */
6
+ #ifndef PRISM_PARSER_H
7
+ #define PRISM_PARSER_H
8
+
9
+ #include "prism/ast.h"
10
+ #include "prism/defines.h"
11
+ #include "prism/encoding.h"
12
+ #include "prism/options.h"
13
+ #include "prism/util/pm_constant_pool.h"
14
+ #include "prism/util/pm_list.h"
15
+ #include "prism/util/pm_newline_list.h"
16
+ #include "prism/util/pm_state_stack.h"
17
+ #include "prism/util/pm_string.h"
18
+
19
+ #include <stdbool.h>
20
+
21
+ /**
22
+ * This enum provides various bits that represent different kinds of states that
23
+ * the lexer can track. This is used to determine which kind of token to return
24
+ * based on the context of the parser. Each enumerator is a bit position (implicitly numbered from 0), not a mask; pm_lex_state_t shifts 1 by it to build the matching state value.
25
+ */
26
+ typedef enum {
27
+ PM_LEX_STATE_BIT_BEG,
28
+ PM_LEX_STATE_BIT_END,
29
+ PM_LEX_STATE_BIT_ENDARG,
30
+ PM_LEX_STATE_BIT_ENDFN,
31
+ PM_LEX_STATE_BIT_ARG,
32
+ PM_LEX_STATE_BIT_CMDARG,
33
+ PM_LEX_STATE_BIT_MID,
34
+ PM_LEX_STATE_BIT_FNAME,
35
+ PM_LEX_STATE_BIT_DOT,
36
+ PM_LEX_STATE_BIT_CLASS,
37
+ PM_LEX_STATE_BIT_LABEL,
38
+ PM_LEX_STATE_BIT_LABELED,
39
+ PM_LEX_STATE_BIT_FITEM
40
+ } pm_lex_state_bit_t;
41
+
42
+ /**
43
+ * This enum combines the various bits from the above enum into individual
44
+ * values that represent the various states of the lexer. Each single state is 1 shifted by its bit position; BEG_ANY, ARG_ANY, and END_ANY are bitwise ORs of several related single states.
45
+ */
46
+ typedef enum {
47
+ PM_LEX_STATE_NONE = 0,
48
+ PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG),
49
+ PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END),
50
+ PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG),
51
+ PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN),
52
+ PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG),
53
+ PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG),
54
+ PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID),
55
+ PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME),
56
+ PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT),
57
+ PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS),
58
+ PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL),
59
+ PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED),
60
+ PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM),
61
+ PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS,
62
+ PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG,
63
+ PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN
64
+ } pm_lex_state_t;
65
+
66
+ /**
67
+ * The type of quote that a heredoc uses. Every value other than NONE is the quote character itself.
68
+ */
69
+ typedef enum {
70
+ PM_HEREDOC_QUOTE_NONE,
71
+ PM_HEREDOC_QUOTE_SINGLE = '\'',
72
+ PM_HEREDOC_QUOTE_DOUBLE = '"',
73
+ PM_HEREDOC_QUOTE_BACKTICK = '`',
74
+ } pm_heredoc_quote_t;
75
+
76
+ /**
77
+ * The type of indentation that a heredoc uses. A tilde heredoc is the kind whose lines are dedented by their common leading whitespace (tracked in the heredoc lex mode).
78
+ */
79
+ typedef enum {
80
+ PM_HEREDOC_INDENT_NONE,
81
+ PM_HEREDOC_INDENT_DASH,
82
+ PM_HEREDOC_INDENT_TILDE,
83
+ } pm_heredoc_indent_t;
84
+
85
+ /**
85
+ * When lexing Ruby source, the lexer has a small amount of state to tell which
86
+ * kind of token it is currently lexing. For example, when we find the start of
87
+ * a string, the first token that we return is a TOKEN_STRING_BEGIN token. After
88
+ * that the lexer is now in the PM_LEX_STRING mode, and will return tokens that
89
+ * are found as part of a string. Modes are chained through the prev pointer so that they can be pushed and popped like a stack.
90
+ */
91
+ typedef struct pm_lex_mode {
92
+ /** The type of this lex mode. */
93
+ enum {
94
+ /** This state is used when any given token is being lexed. */
95
+ PM_LEX_DEFAULT,
96
+
97
+ /**
98
+ * This state is used when we're lexing as normal but inside an embedded
99
+ * expression of a string.
100
+ */
101
+ PM_LEX_EMBEXPR,
102
+
103
+ /**
104
+ * This state is used when we're lexing a variable that is embedded
105
+ * directly inside of a string with the # shorthand.
106
+ */
107
+ PM_LEX_EMBVAR,
108
+
109
+ /** This state is used when we are inside the content of a heredoc. */
110
+ PM_LEX_HEREDOC,
111
+
112
+ /**
113
+ * This state is used when we are lexing a list of tokens, as in a %w
114
+ * word list literal or a %i symbol list literal.
115
+ */
116
+ PM_LEX_LIST,
117
+
118
+ /**
119
+ * This state is used when a regular expression has been begun and we
120
+ * are looking for the terminator.
121
+ */
122
+ PM_LEX_REGEXP,
123
+
124
+ /**
125
+ * This state is used when we are lexing a string or a string-like
126
+ * token, as in string content with either kind of quote or an xstring.
127
+ */
128
+ PM_LEX_STRING
129
+ } mode;
130
+
131
+ /** The data associated with this type of lex mode. This is a union, so only one member holds data at a time. */
132
+ union {
133
+ struct {
134
+ /** This keeps track of the nesting level of the list. */
135
+ size_t nesting;
136
+
137
+ /** Whether or not interpolation is allowed in this list. */
138
+ bool interpolation;
139
+
140
+ /**
141
+ * When lexing a list, it takes into account balancing the
142
+ * terminator if the terminator is one of (), [], {}, or <>.
143
+ */
144
+ uint8_t incrementor;
145
+
146
+ /** This is the terminator of the list literal. */
147
+ uint8_t terminator;
148
+
149
+ /**
150
+ * This is the character set that should be used to delimit the
151
+ * tokens within the list.
152
+ */
153
+ uint8_t breakpoints[11];
154
+ } list;
155
+
156
+ struct {
157
+ /**
158
+ * This keeps track of the nesting level of the regular expression.
159
+ */
160
+ size_t nesting;
161
+
162
+ /**
163
+ * When lexing a regular expression, it takes into account balancing
164
+ * the terminator if the terminator is one of (), [], {}, or <>.
165
+ */
166
+ uint8_t incrementor;
167
+
168
+ /** This is the terminator of the regular expression. */
169
+ uint8_t terminator;
170
+
171
+ /**
172
+ * This is the character set that should be used to delimit the
173
+ * tokens within the regular expression.
174
+ */
175
+ uint8_t breakpoints[6];
176
+ } regexp;
177
+
178
+ struct {
179
+ /** This keeps track of the nesting level of the string. */
180
+ size_t nesting;
181
+
182
+ /** Whether or not interpolation is allowed in this string. */
183
+ bool interpolation;
184
+
185
+ /**
186
+ * Whether or not at the end of the string we should allow a :,
187
+ * which would indicate this was a dynamic symbol instead of a
188
+ * string.
189
+ */
190
+ bool label_allowed;
191
+
192
+ /**
193
+ * When lexing a string, it takes into account balancing the
194
+ * terminator if the terminator is one of (), [], {}, or <>.
195
+ */
196
+ uint8_t incrementor;
197
+
198
+ /**
199
+ * This is the terminator of the string. It is typically either a
200
+ * single or double quote.
201
+ */
202
+ uint8_t terminator;
203
+
204
+ /**
205
+ * This is the character set that should be used to delimit the
206
+ * tokens within the string.
207
+ */
208
+ uint8_t breakpoints[6];
209
+ } string;
210
+
211
+ struct {
212
+ /** A pointer to the start of the heredoc identifier. */
213
+ const uint8_t *ident_start;
214
+
215
+ /** The length of the heredoc identifier. */
216
+ size_t ident_length;
217
+
218
+ /** The type of quote that the heredoc uses. */
219
+ pm_heredoc_quote_t quote;
220
+
221
+ /** The type of indentation that the heredoc uses. */
222
+ pm_heredoc_indent_t indent;
223
+
224
+ /**
225
+ * This is the pointer to the character where lexing should resume
226
+ * once the heredoc has been completely processed.
227
+ */
228
+ const uint8_t *next_start;
229
+
230
+ /**
231
+ * This is used to track the amount of common whitespace on each
232
+ * line so that we know how much to dedent each line in the case of
233
+ * a tilde heredoc.
234
+ */
235
+ size_t common_whitespace;
236
+ } heredoc;
237
+ } as;
238
+
239
+ /** The previous lex mode so that it knows how to pop. */
240
+ struct pm_lex_mode *prev;
241
+ } pm_lex_mode_t;
243
+
244
+ /**
244
+ * We pre-allocate a certain number of lex modes in order to avoid having to
245
+ * call malloc too many times while parsing. You really shouldn't need more than
246
+ * this because you only really nest deeply when doing string interpolation. This is the length of the pm_lex_mode_t array embedded in the parser.
247
+ */
248
+ #define PM_LEX_STACK_SIZE 4
250
+
251
+ /**
251
+ * The parser used to parse Ruby source. This is only a forward declaration so that the callback types can refer to the parser; struct pm_parser is defined later in this header.
252
+ */
253
+ typedef struct pm_parser pm_parser_t;
255
+
256
+ /**
256
+ * While parsing, we keep track of a stack of contexts. This is helpful for
257
+ * error recovery so that we can pop back to a previous context when we hit a
258
+ * token that is understood by a parent context but not by the current context.
259
+ */
260
+ typedef enum {
261
+ /** a null context, used for returning a value from a function */
262
+ PM_CONTEXT_NONE = 0,
263
+
264
+ /** a begin statement */
265
+ PM_CONTEXT_BEGIN,
266
+
267
+ /** expressions in block arguments using braces */
268
+ PM_CONTEXT_BLOCK_BRACES,
269
+
270
+ /** expressions in block arguments using do..end */
271
+ PM_CONTEXT_BLOCK_KEYWORDS,
272
+
273
+ /** a case when statement */
274
+ PM_CONTEXT_CASE_WHEN,
275
+
276
+ /** a case in statement */
277
+ PM_CONTEXT_CASE_IN,
278
+
279
+ /** a class declaration */
280
+ PM_CONTEXT_CLASS,
281
+
282
+ /** a method definition */
283
+ PM_CONTEXT_DEF,
284
+
285
+ /** a method definition's parameters */
286
+ PM_CONTEXT_DEF_PARAMS,
287
+
288
+ /** a method definition's default parameter */
289
+ PM_CONTEXT_DEFAULT_PARAMS,
290
+
291
+ /** an else clause */
292
+ PM_CONTEXT_ELSE,
293
+
294
+ /** an elsif clause */
295
+ PM_CONTEXT_ELSIF,
296
+
297
+ /** an interpolated expression */
298
+ PM_CONTEXT_EMBEXPR,
299
+
300
+ /** an ensure statement */
301
+ PM_CONTEXT_ENSURE,
302
+
303
+ /** an ensure statement within a method definition */
304
+ PM_CONTEXT_ENSURE_DEF,
305
+
306
+ /** a for loop */
307
+ PM_CONTEXT_FOR,
308
+
309
+ /** a for loop's index */
310
+ PM_CONTEXT_FOR_INDEX,
311
+
312
+ /** an if statement */
313
+ PM_CONTEXT_IF,
314
+
315
+ /** a lambda expression with braces */
316
+ PM_CONTEXT_LAMBDA_BRACES,
317
+
318
+ /** a lambda expression with do..end */
319
+ PM_CONTEXT_LAMBDA_DO_END,
320
+
321
+ /** the top level context */
322
+ PM_CONTEXT_MAIN,
323
+
324
+ /** a module declaration */
325
+ PM_CONTEXT_MODULE,
326
+
327
+ /** a parenthesized expression */
328
+ PM_CONTEXT_PARENS,
329
+
330
+ /** an END block */
331
+ PM_CONTEXT_POSTEXE,
332
+
333
+ /** a predicate inside an if/elsif/unless statement */
334
+ PM_CONTEXT_PREDICATE,
335
+
336
+ /** a BEGIN block */
337
+ PM_CONTEXT_PREEXE,
338
+
339
+ /** a rescue else statement */
340
+ PM_CONTEXT_RESCUE_ELSE,
341
+
342
+ /** a rescue else statement within a method definition */
343
+ PM_CONTEXT_RESCUE_ELSE_DEF,
344
+
345
+ /** a rescue statement */
346
+ PM_CONTEXT_RESCUE,
347
+
348
+ /** a rescue statement within a method definition */
349
+ PM_CONTEXT_RESCUE_DEF,
350
+
351
+ /** a singleton class definition */
352
+ PM_CONTEXT_SCLASS,
353
+
354
+ /** an unless statement */
355
+ PM_CONTEXT_UNLESS,
356
+
357
+ /** an until statement */
358
+ PM_CONTEXT_UNTIL,
359
+
360
+ /** a while statement */
361
+ PM_CONTEXT_WHILE,
362
+ } pm_context_t;
364
+
365
+ /** This is a node in a linked list of contexts. The parser holds a pointer to the current node in its current_context field. */
366
+ typedef struct pm_context_node {
367
+ /** The context that this node represents. */
368
+ pm_context_t context;
369
+
370
+ /** A pointer to the previous context in the linked list. */
371
+ struct pm_context_node *prev;
372
+ } pm_context_node_t;
373
+
374
+ /** This is the type of a comment that we've found while parsing. It is stored in the type field of pm_comment_t. */
375
+ typedef enum {
376
+ PM_COMMENT_INLINE,
377
+ PM_COMMENT_EMBDOC
378
+ } pm_comment_type_t;
379
+
380
+ /**
381
+ * This is a node in the linked list of comments that we've found while parsing. The base list node is the first member of the struct.
382
+ *
383
+ * @extends pm_list_node_t
384
+ */
385
+ typedef struct pm_comment {
386
+ /** The embedded base node. */
387
+ pm_list_node_t node;
388
+
389
+ /** The location of the comment in the source. */
390
+ pm_location_t location;
391
+
392
+ /** The type of comment that we've found. */
393
+ pm_comment_type_t type;
394
+ } pm_comment_t;
395
+
396
+ /**
397
+ * This is a node in the linked list of magic comments that we've found while
398
+ * parsing. The key and value are stored as pointers into the source plus 32-bit lengths.
399
+ *
400
+ * @extends pm_list_node_t
401
+ */
402
+ typedef struct {
403
+ /** The embedded base node. */
404
+ pm_list_node_t node;
405
+
406
+ /** A pointer to the start of the key in the source. */
407
+ const uint8_t *key_start;
408
+
409
+ /** A pointer to the start of the value in the source. */
410
+ const uint8_t *value_start;
411
+
412
+ /** The length of the key in the source. */
413
+ uint32_t key_length;
414
+
415
+ /** The length of the value in the source. */
416
+ uint32_t value_length;
417
+ } pm_magic_comment_t;
418
+
419
+ /**
420
+ * When the encoding that is being used to parse the source is changed by prism,
421
+ * we provide the ability here to call out to a user-defined function. The function receives the parser and returns nothing.
422
+ */
423
+ typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser);
424
+
425
+ /**
426
+ * When you are lexing through a file, the lexer needs all of the information
427
+ * that the parser additionally provides (for example, the local table). So if
428
+ * you want to properly lex Ruby, you need to actually lex it in the context of
429
+ * the parser. In order to provide this functionality, we optionally allow a
430
+ * struct to be attached to the parser that calls back out to a user-provided
431
+ * callback when each token is lexed. The parser refers to this struct through its lex_callback pointer.
432
+ */
433
+ typedef struct {
434
+ /**
435
+ * This opaque pointer is used to provide whatever information the user
436
+ * deemed necessary to the callback. In our case we use it to pass the array
437
+ * that the tokens get appended into.
438
+ */
439
+ void *data;
440
+
441
+ /**
442
+ * This is the callback that is called when a token is lexed. It is passed
443
+ * the opaque data pointer, the parser, and the token that was lexed.
444
+ */
445
+ void (*callback)(void *data, pm_parser_t *parser, pm_token_t *token);
446
+ } pm_lex_callback_t;
447
+
448
+ /**
449
+ * This struct represents a node in a linked list of scopes. Some scopes can see
450
+ * into their parent scopes, while others cannot.
451
+ */
452
+ typedef struct pm_scope {
453
+ /** The IDs of the locals in the given scope. */
454
+ pm_constant_id_list_t locals;
455
+
456
+ /** A pointer to the previous scope in the linked list. */
457
+ struct pm_scope *previous;
458
+
459
+ /**
460
+ * A boolean indicating whether or not this scope can see into its parent.
461
+ * If closed is true, then the scope cannot see into its parent.
462
+ */
463
+ bool closed;
464
+
465
+ /**
466
+ * A boolean indicating whether or not this scope has explicit parameters.
467
+ * This is necessary to determine whether or not numbered parameters are
468
+ * allowed.
469
+ */
470
+ bool explicit_params;
471
+
472
+ /**
473
+ * Bit flags (packed into a uint8_t, not separate booleans) indicating whether the parameters for this scope have declared
474
+ * forwarding parameters. NOTE(review): presumably a combination of the PM_FORWARDING_* constants defined after this struct - confirm against the parser.
475
+ *
476
+ * For example, some combinations of:
477
+ * def foo(*); end
478
+ * def foo(**); end
479
+ * def foo(&); end
480
+ * def foo(...); end
481
+ */
482
+
483
+ uint8_t forwarding_params;
484
+
485
+ /**
486
+ * An integer indicating the number of numbered parameters on this scope.
487
+ * This is necessary to determine if child blocks are allowed to use
488
+ * numbered parameters, and to pass information to consumers of the AST
489
+ * about how many numbered parameters exist.
490
+ */
491
+ uint8_t numbered_parameters;
492
+ } pm_scope_t;
493
+
494
+ static const uint8_t PM_FORWARDING_POSITIONALS = 0x1; /* These four constants are distinct single-bit values (0x1, 0x2, 0x4, 0x8). NOTE(review): presumably the flags stored in pm_scope_t.forwarding_params - confirm. */
495
+ static const uint8_t PM_FORWARDING_KEYWORDS = 0x2;
496
+ static const uint8_t PM_FORWARDING_BLOCK = 0x4;
497
+ static const uint8_t PM_FORWARDING_ALL = 0x8;
498
+
499
+ /**
500
+ * This struct represents the overall parser. It contains a reference to the
501
+ * source file, as well as pointers that indicate where in the source it's
502
+ * currently parsing. It also contains the most recent and current token that
503
+ * it's considering.
504
+ */
505
+ struct pm_parser {
506
+ /** The current state of the lexer. */
507
+ pm_lex_state_t lex_state;
508
+
509
+ /** Tracks the current nesting of (), [], and {}. */
510
+ int enclosure_nesting;
511
+
512
+ /**
513
+ * Used to temporarily track the nesting of enclosures to determine if a {
514
+ * is the beginning of a lambda following the parameters of a lambda.
515
+ */
516
+ int lambda_enclosure_nesting;
517
+
518
+ /**
519
+ * Used to track the nesting of braces to ensure we get the correct value
520
+ * when we are interpolating blocks with braces.
521
+ */
522
+ int brace_nesting;
523
+
524
+ /**
525
+ * The stack used to determine if a do keyword belongs to the predicate of a
526
+ * while, until, or for loop.
527
+ */
528
+ pm_state_stack_t do_loop_stack;
529
+
530
+ /**
531
+ * The stack used to determine if a do keyword belongs to the beginning of a
532
+ * block.
533
+ */
534
+ pm_state_stack_t accepts_block_stack;
535
+
536
+ /** A stack of lex modes. */
537
+ struct {
538
+ /** The current mode of the lexer. */
539
+ pm_lex_mode_t *current;
540
+
541
+ /** The stack of lexer modes, pre-allocated with PM_LEX_STACK_SIZE entries. */
542
+ pm_lex_mode_t stack[PM_LEX_STACK_SIZE];
543
+
544
+ /** The current index into the lexer mode stack. */
545
+ size_t index;
546
+ } lex_modes;
547
+
548
+ /** The pointer to the start of the source. */
549
+ const uint8_t *start;
550
+
551
+ /** The pointer to the end of the source. */
552
+ const uint8_t *end;
553
+
554
+ /** The previous token we were considering. */
555
+ pm_token_t previous;
556
+
557
+ /** The current token we're considering. */
558
+ pm_token_t current;
559
+
560
+ /**
561
+ * This is a special field set on the parser when we need the parser to jump
562
+ * to a specific location when lexing the next token, as opposed to just
563
+ * using the end of the previous token. Normally this is NULL.
564
+ */
565
+ const uint8_t *next_start;
566
+
567
+ /**
568
+ * This field indicates the end of a heredoc whose identifier was found on
569
+ * the current line. If another heredoc is found on the same line, then this
570
+ * will be moved forward to the end of that heredoc. If no heredocs are
571
+ * found on a line then this is NULL.
572
+ */
573
+ const uint8_t *heredoc_end;
574
+
575
+ /** The list of comments that have been found while parsing. */
576
+ pm_list_t comment_list;
577
+
578
+ /** The list of magic comments that have been found while parsing. */
579
+ pm_list_t magic_comment_list;
580
+
581
+ /**
582
+ * An optional location that represents the location of the __END__ marker
583
+ * and the rest of the content of the file. This content is loaded into the
584
+ * DATA constant when the file being parsed is the main file being executed.
585
+ */
586
+ pm_location_t data_loc;
587
+
588
+ /** The list of warnings that have been found while parsing. */
589
+ pm_list_t warning_list;
590
+
591
+ /** The list of errors that have been found while parsing. */
592
+ pm_list_t error_list;
593
+
594
+ /** The current local scope. */
595
+ pm_scope_t *current_scope;
596
+
597
+ /** The current parsing context. */
598
+ pm_context_node_t *current_context;
599
+
600
+ /**
601
+ * The encoding functions for the current file are attached to the parser as
602
+ * it's parsing so that they can change with a magic comment.
603
+ */
604
+ const pm_encoding_t *encoding;
605
+
606
+ /**
607
+ * When the encoding that is being used to parse the source is changed by
608
+ * prism, we provide the ability here to call out to a user-defined
609
+ * function.
610
+ */
611
+ pm_encoding_changed_callback_t encoding_changed_callback;
612
+
613
+ /**
614
+ * This pointer indicates where a comment must start if it is to be
615
+ * considered an encoding comment.
616
+ */
617
+ const uint8_t *encoding_comment_start;
618
+
619
+ /**
620
+ * This is an optional callback that can be attached to the parser that will
621
+ * be called whenever a new token is lexed by the parser.
622
+ */
623
+ pm_lex_callback_t *lex_callback;
624
+
625
+ /**
626
+ * This is the path of the file being parsed. We use the filepath when
627
+ * constructing SourceFileNodes.
628
+ */
629
+ pm_string_t filepath;
630
+
631
+ /**
632
+ * This constant pool keeps all of the constants defined throughout the file
633
+ * so that we can reference them later.
634
+ */
635
+ pm_constant_pool_t constant_pool;
636
+
637
+ /** This is the list of newline offsets in the source file. */
638
+ pm_newline_list_t newline_list;
639
+
640
+ /**
641
+ * We want to add a flag to integer nodes that indicates their base. We only
642
+ * want to parse these once, but we don't have space on the token itself to
643
+ * communicate this information. So we store it here and pass it through
644
+ * when we find tokens that we need it for.
645
+ */
646
+ pm_node_flags_t integer_base;
647
+
648
+ /**
649
+ * This string is used to pass information from the lexer to the parser. It
650
+ * is particularly necessary because of escape sequences.
651
+ */
652
+ pm_string_t current_string;
653
+
654
+ /**
655
+ * The line number at the start of the parse. This will be used to offset
656
+ * the line numbers of all of the locations.
657
+ */
658
+ int32_t start_line;
659
+
660
+ /**
661
+ * When a string-like expression is being lexed, any byte or escape sequence
662
+ * that resolves to a value whose top bit is set (i.e., >= 0x80) will
663
+ * explicitly set the encoding to the same encoding as the source.
664
+ * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that
665
+ * resolves to a value whose top bit is set, then the encoding will be
666
+ * explicitly set to UTF-8.
667
+ *
668
+ * The _next_ time this happens, if the encoding that is about to become the
669
+ * explicitly set encoding does not match the previously set explicit
670
+ * encoding, a mixed encoding error will be emitted.
671
+ *
672
+ * When the expression is finished being lexed, the explicit encoding
673
+ * controls the encoding of the expression. For the most part this means
674
+ * that the expression will either be encoded in the source encoding or
675
+ * UTF-8. This holds for all encodings except US-ASCII. If the source is
676
+ * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the
677
+ * expression will be encoded as ASCII-8BIT.
678
+ *
679
+ * Note that if the expression is a list, different elements within the same
680
+ * list can have different encodings, so this will get reset between each
681
+ * element. Furthermore all of this only applies to lists that support
682
+ * interpolation, because otherwise escapes that could change the encoding
683
+ * are ignored.
684
+ *
685
+ * At first glance, it may make more sense for this to live on the lexer
686
+ * mode, but we need it here to communicate back to the parser for character
687
+ * literals that do not push a new lexer mode.
688
+ */
689
+ const pm_encoding_t *explicit_encoding;
690
+
691
+ /** The id of the name of the parameter whose default value is currently being parsed. */
692
+ pm_constant_id_t current_param_name;
693
+
694
+ /** The version of prism that we should use to parse. */
695
+ pm_options_version_t version;
696
+
697
+ /** Whether or not we're at the beginning of a command. */
698
+ bool command_start;
699
+
700
+ /** Whether or not we're currently recovering from a syntax error. */
701
+ bool recovering;
702
+
703
+ /**
704
+ * Whether or not the encoding has been changed by a magic comment. We use
705
+ * this to provide a fast path for the lexer instead of going through the
706
+ * function pointer.
707
+ */
708
+ bool encoding_changed;
709
+
710
+ /**
711
+ * This flag indicates that we are currently parsing a pattern matching
712
+ * expression and impacts the calculation of newlines.
713
+ */
714
+ bool pattern_matching_newlines;
715
+
716
+ /** This flag indicates that we are currently parsing a keyword argument. */
717
+ bool in_keyword_arg;
718
+
719
+ /**
720
+ * Whether or not the parser has seen a token that has semantic meaning
721
+ * (i.e., a token that is not a comment or whitespace).
722
+ */
723
+ bool semantic_token_seen;
724
+
725
+ /**
726
+ * Whether or not we have found a frozen_string_literal magic comment with
727
+ * a true value.
728
+ */
729
+ bool frozen_string_literal;
730
+ };
731
+
732
+ #endif