prism 0.16.0 → 0.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -1
- data/Makefile +6 -0
- data/README.md +1 -1
- data/config.yml +50 -35
- data/docs/fuzzing.md +1 -1
- data/docs/serialization.md +28 -29
- data/ext/prism/api_node.c +802 -770
- data/ext/prism/api_pack.c +20 -9
- data/ext/prism/extension.c +464 -162
- data/ext/prism/extension.h +1 -1
- data/include/prism/ast.h +3173 -763
- data/include/prism/defines.h +32 -9
- data/include/prism/diagnostic.h +36 -3
- data/include/prism/enc/pm_encoding.h +118 -28
- data/include/prism/node.h +38 -13
- data/include/prism/options.h +204 -0
- data/include/prism/pack.h +44 -33
- data/include/prism/parser.h +445 -200
- data/include/prism/prettyprint.h +12 -1
- data/include/prism/regexp.h +16 -2
- data/include/prism/util/pm_buffer.h +94 -16
- data/include/prism/util/pm_char.h +162 -48
- data/include/prism/util/pm_constant_pool.h +126 -32
- data/include/prism/util/pm_list.h +68 -38
- data/include/prism/util/pm_memchr.h +18 -3
- data/include/prism/util/pm_newline_list.h +70 -27
- data/include/prism/util/pm_state_stack.h +25 -7
- data/include/prism/util/pm_string.h +115 -27
- data/include/prism/util/pm_string_list.h +25 -6
- data/include/prism/util/pm_strncasecmp.h +32 -0
- data/include/prism/util/pm_strpbrk.h +31 -17
- data/include/prism/version.h +27 -2
- data/include/prism.h +224 -31
- data/lib/prism/compiler.rb +6 -3
- data/lib/prism/debug.rb +23 -7
- data/lib/prism/dispatcher.rb +33 -18
- data/lib/prism/dsl.rb +10 -5
- data/lib/prism/ffi.rb +132 -80
- data/lib/prism/lex_compat.rb +25 -15
- data/lib/prism/mutation_compiler.rb +10 -5
- data/lib/prism/node.rb +370 -135
- data/lib/prism/node_ext.rb +1 -1
- data/lib/prism/node_inspector.rb +1 -1
- data/lib/prism/pack.rb +79 -40
- data/lib/prism/parse_result/comments.rb +7 -2
- data/lib/prism/parse_result/newlines.rb +4 -0
- data/lib/prism/parse_result.rb +150 -30
- data/lib/prism/pattern.rb +11 -0
- data/lib/prism/ripper_compat.rb +28 -10
- data/lib/prism/serialize.rb +86 -54
- data/lib/prism/visitor.rb +10 -3
- data/lib/prism.rb +20 -2
- data/prism.gemspec +4 -2
- data/rbi/prism.rbi +104 -60
- data/rbi/prism_static.rbi +16 -2
- data/sig/prism.rbs +72 -43
- data/sig/prism_static.rbs +14 -1
- data/src/diagnostic.c +56 -53
- data/src/enc/pm_big5.c +1 -0
- data/src/enc/pm_euc_jp.c +1 -0
- data/src/enc/pm_gbk.c +1 -0
- data/src/enc/pm_shift_jis.c +1 -0
- data/src/enc/pm_tables.c +316 -80
- data/src/enc/pm_unicode.c +53 -8
- data/src/enc/pm_windows_31j.c +1 -0
- data/src/node.c +334 -321
- data/src/options.c +170 -0
- data/src/prettyprint.c +74 -47
- data/src/prism.c +1642 -856
- data/src/regexp.c +151 -95
- data/src/serialize.c +44 -20
- data/src/token_type.c +3 -1
- data/src/util/pm_buffer.c +45 -15
- data/src/util/pm_char.c +103 -57
- data/src/util/pm_constant_pool.c +51 -21
- data/src/util/pm_list.c +12 -4
- data/src/util/pm_memchr.c +5 -3
- data/src/util/pm_newline_list.c +20 -12
- data/src/util/pm_state_stack.c +9 -3
- data/src/util/pm_string.c +95 -85
- data/src/util/pm_string_list.c +14 -15
- data/src/util/pm_strncasecmp.c +10 -3
- data/src/util/pm_strpbrk.c +25 -19
- metadata +5 -3
- data/docs/prism.png +0 -0
data/include/prism/parser.h
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
/**
|
2
|
+
* @file parser.h
|
3
|
+
*
|
4
|
+
* The parser used to parse Ruby source.
|
5
|
+
*/
|
1
6
|
#ifndef PRISM_PARSER_H
|
2
7
|
#define PRISM_PARSER_H
|
3
8
|
|
@@ -12,9 +17,11 @@
|
|
12
17
|
|
13
18
|
#include <stdbool.h>
|
14
19
|
|
15
|
-
|
16
|
-
|
17
|
-
|
20
|
+
/**
|
21
|
+
* This enum provides various bits that represent different kinds of states that
|
22
|
+
* the lexer can track. This is used to determine which kind of token to return
|
23
|
+
* based on the context of the parser.
|
24
|
+
*/
|
18
25
|
typedef enum {
|
19
26
|
PM_LEX_STATE_BIT_BEG,
|
20
27
|
PM_LEX_STATE_BIT_END,
|
@@ -31,8 +38,10 @@ typedef enum {
|
|
31
38
|
PM_LEX_STATE_BIT_FITEM
|
32
39
|
} pm_lex_state_bit_t;
|
33
40
|
|
34
|
-
|
35
|
-
|
41
|
+
/**
|
42
|
+
* This enum combines the various bits from the above enum into individual
|
43
|
+
* values that represent the various states of the lexer.
|
44
|
+
*/
|
36
45
|
typedef enum {
|
37
46
|
PM_LEX_STATE_NONE = 0,
|
38
47
|
PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG),
|
@@ -53,6 +62,9 @@ typedef enum {
|
|
53
62
|
PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN
|
54
63
|
} pm_lex_state_t;
|
55
64
|
|
65
|
+
/**
|
66
|
+
* The type of quote that a heredoc uses.
|
67
|
+
*/
|
56
68
|
typedef enum {
|
57
69
|
PM_HEREDOC_QUOTE_NONE,
|
58
70
|
PM_HEREDOC_QUOTE_SINGLE = '\'',
|
@@ -60,386 +72,619 @@ typedef enum {
|
|
60
72
|
PM_HEREDOC_QUOTE_BACKTICK = '`',
|
61
73
|
} pm_heredoc_quote_t;
|
62
74
|
|
75
|
+
/**
|
76
|
+
* The type of indentation that a heredoc uses.
|
77
|
+
*/
|
63
78
|
typedef enum {
|
64
79
|
PM_HEREDOC_INDENT_NONE,
|
65
80
|
PM_HEREDOC_INDENT_DASH,
|
66
81
|
PM_HEREDOC_INDENT_TILDE,
|
67
82
|
} pm_heredoc_indent_t;
|
68
83
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
84
|
+
/**
|
85
|
+
* When lexing Ruby source, the lexer has a small amount of state to tell which
|
86
|
+
* kind of token it is currently lexing. For example, when we find the start of
|
87
|
+
* a string, the first token that we return is a TOKEN_STRING_BEGIN token. After
|
88
|
+
* that the lexer is now in the PM_LEX_STRING mode, and will return tokens that
|
89
|
+
* are found as part of a string.
|
90
|
+
*/
|
74
91
|
typedef struct pm_lex_mode {
|
92
|
+
/** The type of this lex mode. */
|
75
93
|
enum {
|
76
|
-
|
94
|
+
/** This state is used when any given token is being lexed. */
|
77
95
|
PM_LEX_DEFAULT,
|
78
96
|
|
79
|
-
|
80
|
-
|
97
|
+
/**
|
98
|
+
* This state is used when we're lexing as normal but inside an embedded
|
99
|
+
* expression of a string.
|
100
|
+
*/
|
81
101
|
PM_LEX_EMBEXPR,
|
82
102
|
|
83
|
-
|
84
|
-
|
103
|
+
/**
|
104
|
+
* This state is used when we're lexing a variable that is embedded
|
105
|
+
* directly inside of a string with the # shorthand.
|
106
|
+
*/
|
85
107
|
PM_LEX_EMBVAR,
|
86
108
|
|
87
|
-
|
109
|
+
/** This state is used when you are inside the content of a heredoc. */
|
88
110
|
PM_LEX_HEREDOC,
|
89
111
|
|
90
|
-
|
91
|
-
|
112
|
+
/**
|
113
|
+
* This state is used when we are lexing a list of tokens, as in a %w
|
114
|
+
* word list literal or a %i symbol list literal.
|
115
|
+
*/
|
92
116
|
PM_LEX_LIST,
|
93
117
|
|
94
|
-
|
95
|
-
|
118
|
+
/**
|
119
|
+
* This state is used when a regular expression has been begun and we
|
120
|
+
* are looking for the terminator.
|
121
|
+
*/
|
96
122
|
PM_LEX_REGEXP,
|
97
123
|
|
98
|
-
|
99
|
-
|
124
|
+
/**
|
125
|
+
* This state is used when we are lexing a string or a string-like
|
126
|
+
* token, as in string content with either quote or an xstring.
|
127
|
+
*/
|
100
128
|
PM_LEX_STRING
|
101
129
|
} mode;
|
102
130
|
|
131
|
+
/** The data associated with this type of lex mode. */
|
103
132
|
union {
|
104
133
|
struct {
|
105
|
-
|
134
|
+
/** This keeps track of the nesting level of the list. */
|
106
135
|
size_t nesting;
|
107
136
|
|
108
|
-
|
137
|
+
/** Whether or not interpolation is allowed in this list. */
|
109
138
|
bool interpolation;
|
110
139
|
|
111
|
-
|
112
|
-
|
140
|
+
/**
|
141
|
+
* When lexing a list, it takes into account balancing the
|
142
|
+
* terminator if the terminator is one of (), [], {}, or <>.
|
143
|
+
*/
|
113
144
|
uint8_t incrementor;
|
114
145
|
|
115
|
-
|
146
|
+
/** This is the terminator of the list literal. */
|
116
147
|
uint8_t terminator;
|
117
148
|
|
118
|
-
|
119
|
-
|
149
|
+
/**
|
150
|
+
* This is the character set that should be used to delimit the
|
151
|
+
* tokens within the list.
|
152
|
+
*/
|
120
153
|
uint8_t breakpoints[11];
|
121
154
|
} list;
|
122
155
|
|
123
156
|
struct {
|
124
|
-
|
157
|
+
/**
|
158
|
+
* This keeps track of the nesting level of the regular expression.
|
159
|
+
*/
|
125
160
|
size_t nesting;
|
126
161
|
|
127
|
-
|
128
|
-
|
162
|
+
/**
|
163
|
+
* When lexing a regular expression, it takes into account balancing
|
164
|
+
* the terminator if the terminator is one of (), [], {}, or <>.
|
165
|
+
*/
|
129
166
|
uint8_t incrementor;
|
130
167
|
|
131
|
-
|
168
|
+
/** This is the terminator of the regular expression. */
|
132
169
|
uint8_t terminator;
|
133
170
|
|
134
|
-
|
135
|
-
|
171
|
+
/**
|
172
|
+
* This is the character set that should be used to delimit the
|
173
|
+
* tokens within the regular expression.
|
174
|
+
*/
|
136
175
|
uint8_t breakpoints[6];
|
137
176
|
} regexp;
|
138
177
|
|
139
178
|
struct {
|
140
|
-
|
179
|
+
/** This keeps track of the nesting level of the string. */
|
141
180
|
size_t nesting;
|
142
181
|
|
143
|
-
|
182
|
+
/** Whether or not interpolation is allowed in this string. */
|
144
183
|
bool interpolation;
|
145
184
|
|
146
|
-
|
147
|
-
|
148
|
-
|
185
|
+
/**
|
186
|
+
* Whether or not at the end of the string we should allow a :,
|
187
|
+
* which would indicate this was a dynamic symbol instead of a
|
188
|
+
* string.
|
189
|
+
*/
|
149
190
|
bool label_allowed;
|
150
191
|
|
151
|
-
|
152
|
-
|
192
|
+
/**
|
193
|
+
* When lexing a string, it takes into account balancing the
|
194
|
+
* terminator if the terminator is one of (), [], {}, or <>.
|
195
|
+
*/
|
153
196
|
uint8_t incrementor;
|
154
197
|
|
155
|
-
|
156
|
-
|
198
|
+
/**
|
199
|
+
* This is the terminator of the string. It is typically either a
|
200
|
+
* single or double quote.
|
201
|
+
*/
|
157
202
|
uint8_t terminator;
|
158
203
|
|
159
|
-
|
160
|
-
|
204
|
+
/**
|
205
|
+
* This is the character set that should be used to delimit the
|
206
|
+
* tokens within the string.
|
207
|
+
*/
|
161
208
|
uint8_t breakpoints[6];
|
162
209
|
} string;
|
163
210
|
|
164
211
|
struct {
|
165
|
-
|
166
|
-
// identifier.
|
212
|
+
/** A pointer to the start of the heredoc identifier. */
|
167
213
|
const uint8_t *ident_start;
|
214
|
+
|
215
|
+
/** The length of the heredoc identifier. */
|
168
216
|
size_t ident_length;
|
169
217
|
|
218
|
+
/** The type of quote that the heredoc uses. */
|
170
219
|
pm_heredoc_quote_t quote;
|
220
|
+
|
221
|
+
/** The type of indentation that the heredoc uses. */
|
171
222
|
pm_heredoc_indent_t indent;
|
172
223
|
|
173
|
-
|
174
|
-
|
224
|
+
/**
|
225
|
+
* This is the pointer to the character where lexing should resume
|
226
|
+
* once the heredoc has been completely processed.
|
227
|
+
*/
|
175
228
|
const uint8_t *next_start;
|
176
229
|
|
177
|
-
|
178
|
-
|
179
|
-
|
230
|
+
/**
|
231
|
+
* This is used to track the amount of common whitespace on each
|
232
|
+
* line so that we know how much to dedent each line in the case of
|
233
|
+
* a tilde heredoc.
|
234
|
+
*/
|
180
235
|
size_t common_whitespace;
|
181
236
|
} heredoc;
|
182
237
|
} as;
|
183
238
|
|
184
|
-
|
239
|
+
/** The previous lex mode so that it knows how to pop. */
|
185
240
|
struct pm_lex_mode *prev;
|
186
241
|
} pm_lex_mode_t;
|
187
242
|
|
188
|
-
|
189
|
-
|
190
|
-
|
243
|
+
/**
|
244
|
+
* We pre-allocate a certain number of lex states in order to avoid having to
|
245
|
+
* call malloc too many times while parsing. You really shouldn't need more than
|
246
|
+
* this because you only really nest deeply when doing string interpolation.
|
247
|
+
*/
|
191
248
|
#define PM_LEX_STACK_SIZE 4
|
192
249
|
|
193
|
-
|
194
|
-
|
250
|
+
/**
|
251
|
+
* The parser used to parse Ruby source.
|
252
|
+
*/
|
195
253
|
typedef struct pm_parser pm_parser_t;
|
196
254
|
|
197
|
-
|
198
|
-
|
199
|
-
|
255
|
+
/**
|
256
|
+
* While parsing, we keep track of a stack of contexts. This is helpful for
|
257
|
+
* error recovery so that we can pop back to a previous context when we hit a
|
258
|
+
* token that is understood by a parent context but not by the current context.
|
259
|
+
*/
|
200
260
|
typedef enum {
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
261
|
+
/** a begin statement */
|
262
|
+
PM_CONTEXT_BEGIN,
|
263
|
+
|
264
|
+
/** expressions in block arguments using braces */
|
265
|
+
PM_CONTEXT_BLOCK_BRACES,
|
266
|
+
|
267
|
+
/** expressions in block arguments using do..end */
|
268
|
+
PM_CONTEXT_BLOCK_KEYWORDS,
|
269
|
+
|
270
|
+
/** a case when statement */
|
271
|
+
PM_CONTEXT_CASE_WHEN,
|
272
|
+
|
273
|
+
/** a case in statement */
|
274
|
+
PM_CONTEXT_CASE_IN,
|
275
|
+
|
276
|
+
/** a class declaration */
|
277
|
+
PM_CONTEXT_CLASS,
|
278
|
+
|
279
|
+
/** a method definition */
|
280
|
+
PM_CONTEXT_DEF,
|
281
|
+
|
282
|
+
/** a method definition's parameters */
|
283
|
+
PM_CONTEXT_DEF_PARAMS,
|
284
|
+
|
285
|
+
/** a method definition's default parameter */
|
286
|
+
PM_CONTEXT_DEFAULT_PARAMS,
|
287
|
+
|
288
|
+
/** an else clause */
|
289
|
+
PM_CONTEXT_ELSE,
|
290
|
+
|
291
|
+
/** an elsif clause */
|
292
|
+
PM_CONTEXT_ELSIF,
|
293
|
+
|
294
|
+
/** an interpolated expression */
|
295
|
+
PM_CONTEXT_EMBEXPR,
|
296
|
+
|
297
|
+
/** an ensure statement */
|
298
|
+
PM_CONTEXT_ENSURE,
|
299
|
+
|
300
|
+
/** a for loop */
|
301
|
+
PM_CONTEXT_FOR,
|
302
|
+
|
303
|
+
/** a for loop's index */
|
304
|
+
PM_CONTEXT_FOR_INDEX,
|
305
|
+
|
306
|
+
/** an if statement */
|
307
|
+
PM_CONTEXT_IF,
|
308
|
+
|
309
|
+
/** a lambda expression with braces */
|
310
|
+
PM_CONTEXT_LAMBDA_BRACES,
|
311
|
+
|
312
|
+
/** a lambda expression with do..end */
|
313
|
+
PM_CONTEXT_LAMBDA_DO_END,
|
314
|
+
|
315
|
+
/** the top level context */
|
316
|
+
PM_CONTEXT_MAIN,
|
317
|
+
|
318
|
+
/** a module declaration */
|
319
|
+
PM_CONTEXT_MODULE,
|
320
|
+
|
321
|
+
/** a parenthesized expression */
|
322
|
+
PM_CONTEXT_PARENS,
|
323
|
+
|
324
|
+
/** an END block */
|
325
|
+
PM_CONTEXT_POSTEXE,
|
326
|
+
|
327
|
+
/** a predicate inside an if/elsif/unless statement */
|
328
|
+
PM_CONTEXT_PREDICATE,
|
329
|
+
|
330
|
+
/** a BEGIN block */
|
331
|
+
PM_CONTEXT_PREEXE,
|
332
|
+
|
333
|
+
/** a rescue else statement */
|
334
|
+
PM_CONTEXT_RESCUE_ELSE,
|
335
|
+
|
336
|
+
/** a rescue statement */
|
337
|
+
PM_CONTEXT_RESCUE,
|
338
|
+
|
339
|
+
/** a singleton class definition */
|
340
|
+
PM_CONTEXT_SCLASS,
|
341
|
+
|
342
|
+
/** an unless statement */
|
343
|
+
PM_CONTEXT_UNLESS,
|
344
|
+
|
345
|
+
/** an until statement */
|
346
|
+
PM_CONTEXT_UNTIL,
|
347
|
+
|
348
|
+
/** a while statement */
|
349
|
+
PM_CONTEXT_WHILE,
|
231
350
|
} pm_context_t;
|
232
351
|
|
233
|
-
|
352
|
+
/** This is a node in a linked list of contexts. */
|
234
353
|
typedef struct pm_context_node {
|
354
|
+
/** The context that this node represents. */
|
235
355
|
pm_context_t context;
|
356
|
+
|
357
|
+
/** A pointer to the previous context in the linked list. */
|
236
358
|
struct pm_context_node *prev;
|
237
359
|
} pm_context_node_t;
|
238
360
|
|
239
|
-
|
361
|
+
/** This is the type of a comment that we've found while parsing. */
|
240
362
|
typedef enum {
|
241
363
|
PM_COMMENT_INLINE,
|
242
364
|
PM_COMMENT_EMBDOC,
|
243
365
|
PM_COMMENT___END__
|
244
366
|
} pm_comment_type_t;
|
245
367
|
|
246
|
-
|
368
|
+
/**
|
369
|
+
* This is a node in the linked list of comments that we've found while parsing.
|
370
|
+
*
|
371
|
+
* @extends pm_list_node_t
|
372
|
+
*/
|
247
373
|
typedef struct pm_comment {
|
374
|
+
/** The embedded base node. */
|
248
375
|
pm_list_node_t node;
|
376
|
+
|
377
|
+
/** A pointer to the start of the comment in the source. */
|
249
378
|
const uint8_t *start;
|
379
|
+
|
380
|
+
/** A pointer to the end of the comment in the source. */
|
250
381
|
const uint8_t *end;
|
382
|
+
|
383
|
+
/** The type of comment that we've found. */
|
251
384
|
pm_comment_type_t type;
|
252
385
|
} pm_comment_t;
|
253
386
|
|
254
|
-
|
255
|
-
|
387
|
+
/**
|
388
|
+
* This is a node in the linked list of magic comments that we've found while
|
389
|
+
* parsing.
|
390
|
+
*
|
391
|
+
* @extends pm_list_node_t
|
392
|
+
*/
|
256
393
|
typedef struct {
|
394
|
+
/** The embedded base node. */
|
257
395
|
pm_list_node_t node;
|
396
|
+
|
397
|
+
/** A pointer to the start of the key in the source. */
|
258
398
|
const uint8_t *key_start;
|
399
|
+
|
400
|
+
/** A pointer to the start of the value in the source. */
|
259
401
|
const uint8_t *value_start;
|
402
|
+
|
403
|
+
/** The length of the key in the source. */
|
260
404
|
uint32_t key_length;
|
405
|
+
|
406
|
+
/** The length of the value in the source. */
|
261
407
|
uint32_t value_length;
|
262
408
|
} pm_magic_comment_t;
|
263
409
|
|
264
|
-
|
265
|
-
|
410
|
+
/**
|
411
|
+
* When the encoding that is being used to parse the source is changed by prism,
|
412
|
+
* we provide the ability here to call out to a user-defined function.
|
413
|
+
*/
|
266
414
|
typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser);
|
267
415
|
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
416
|
+
/**
|
417
|
+
* When an encoding is encountered that isn't understood by prism, we provide
|
418
|
+
* the ability here to call out to a user-defined function to get an encoding
|
419
|
+
* struct. If the function returns something that isn't NULL, we set that to
|
420
|
+
* our encoding and use it to parse identifiers.
|
421
|
+
*/
|
272
422
|
typedef pm_encoding_t *(*pm_encoding_decode_callback_t)(pm_parser_t *parser, const uint8_t *name, size_t width);
|
273
423
|
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
424
|
+
/**
|
425
|
+
* When you are lexing through a file, the lexer needs all of the information
|
426
|
+
* that the parser additionally provides (for example, the local table). So if
|
427
|
+
* you want to properly lex Ruby, you need to actually lex it in the context of
|
428
|
+
* the parser. In order to provide this functionality, we optionally allow a
|
429
|
+
* struct to be attached to the parser that calls back out to a user-provided
|
430
|
+
* callback when each token is lexed.
|
431
|
+
*/
|
280
432
|
typedef struct {
|
281
|
-
|
282
|
-
|
283
|
-
|
433
|
+
/**
|
434
|
+
* This opaque pointer is used to provide whatever information the user
|
435
|
+
* deemed necessary to the callback. In our case we use it to pass the array
|
436
|
+
* that the tokens get appended into.
|
437
|
+
*/
|
284
438
|
void *data;
|
285
439
|
|
286
|
-
|
287
|
-
|
440
|
+
/**
|
441
|
+
* This is the callback that is called when a token is lexed. It is passed
|
442
|
+
* the opaque data pointer, the parser, and the token that was lexed.
|
443
|
+
*/
|
288
444
|
void (*callback)(void *data, pm_parser_t *parser, pm_token_t *token);
|
289
445
|
} pm_lex_callback_t;
|
290
446
|
|
291
|
-
|
292
|
-
|
447
|
+
/**
|
448
|
+
* This struct represents a node in a linked list of scopes. Some scopes can see
|
449
|
+
* into their parent scopes, while others cannot.
|
450
|
+
*/
|
293
451
|
typedef struct pm_scope {
|
294
|
-
|
452
|
+
/** The IDs of the locals in the given scope. */
|
295
453
|
pm_constant_id_list_t locals;
|
296
454
|
|
297
|
-
|
455
|
+
/** A pointer to the previous scope in the linked list. */
|
298
456
|
struct pm_scope *previous;
|
299
457
|
|
300
|
-
|
301
|
-
|
458
|
+
/**
|
459
|
+
* A boolean indicating whether or not this scope can see into its parent.
|
460
|
+
* If closed is true, then the scope cannot see into its parent.
|
461
|
+
*/
|
302
462
|
bool closed;
|
303
463
|
|
304
|
-
|
305
|
-
|
306
|
-
|
464
|
+
/**
|
465
|
+
* A boolean indicating whether or not this scope has explicit parameters.
|
466
|
+
* This is necessary to determine whether or not numbered parameters are
|
467
|
+
* allowed.
|
468
|
+
*/
|
307
469
|
bool explicit_params;
|
308
470
|
|
309
|
-
|
310
|
-
|
311
|
-
|
471
|
+
/**
|
472
|
+
* A boolean indicating whether or not this scope has numbered parameters.
|
473
|
+
* This is necessary to determine if child blocks are allowed to use
|
474
|
+
* numbered parameters.
|
475
|
+
*/
|
312
476
|
bool numbered_params;
|
313
477
|
|
314
|
-
|
315
|
-
|
316
|
-
|
478
|
+
/**
|
479
|
+
* A transparent scope is a scope that cannot have locals set on itself.
|
480
|
+
* When a local is set on this scope, it will instead be set on the parent
|
481
|
+
* scope's local table.
|
482
|
+
*/
|
317
483
|
bool transparent;
|
318
484
|
} pm_scope_t;
|
319
485
|
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
486
|
+
/**
|
487
|
+
* This struct represents the overall parser. It contains a reference to the
|
488
|
+
* source file, as well as pointers that indicate where in the source it's
|
489
|
+
* currently parsing. It also contains the most recent and current token that
|
490
|
+
* it's considering.
|
491
|
+
*/
|
324
492
|
struct pm_parser {
|
325
|
-
|
326
|
-
|
493
|
+
/** The current state of the lexer. */
|
494
|
+
pm_lex_state_t lex_state;
|
495
|
+
|
496
|
+
/** Tracks the current nesting of (), [], and {}. */
|
497
|
+
int enclosure_nesting;
|
327
498
|
|
328
|
-
|
329
|
-
|
499
|
+
/**
|
500
|
+
* Used to temporarily track the nesting of enclosures to determine if a {
|
501
|
+
* is the beginning of a lambda following the parameters of a lambda.
|
502
|
+
*/
|
330
503
|
int lambda_enclosure_nesting;
|
331
504
|
|
332
|
-
|
333
|
-
|
505
|
+
/**
|
506
|
+
* Used to track the nesting of braces to ensure we get the correct value
|
507
|
+
* when we are interpolating blocks with braces.
|
508
|
+
*/
|
334
509
|
int brace_nesting;
|
335
510
|
|
336
|
-
|
337
|
-
|
511
|
+
/**
|
512
|
+
* The stack used to determine if a do keyword belongs to the predicate of a
|
513
|
+
* while, until, or for loop.
|
514
|
+
*/
|
338
515
|
pm_state_stack_t do_loop_stack;
|
339
516
|
|
340
|
-
|
341
|
-
|
517
|
+
/**
|
518
|
+
* The stack used to determine if a do keyword belongs to the beginning of a
|
519
|
+
* block.
|
520
|
+
*/
|
342
521
|
pm_state_stack_t accepts_block_stack;
|
343
522
|
|
523
|
+
/** A stack of lex modes. */
|
344
524
|
struct {
|
345
|
-
|
346
|
-
pm_lex_mode_t
|
347
|
-
|
525
|
+
/** The current mode of the lexer. */
|
526
|
+
pm_lex_mode_t *current;
|
527
|
+
|
528
|
+
/** The stack of lexer modes. */
|
529
|
+
pm_lex_mode_t stack[PM_LEX_STACK_SIZE];
|
530
|
+
|
531
|
+
/** The current index into the lexer mode stack. */
|
532
|
+
size_t index;
|
348
533
|
} lex_modes;
|
349
534
|
|
350
|
-
|
351
|
-
const uint8_t *
|
352
|
-
|
353
|
-
|
535
|
+
/** The pointer to the start of the source. */
|
536
|
+
const uint8_t *start;
|
537
|
+
|
538
|
+
/** The pointer to the end of the source. */
|
539
|
+
const uint8_t *end;
|
540
|
+
|
541
|
+
/** The previous token we were considering. */
|
542
|
+
pm_token_t previous;
|
543
|
+
|
544
|
+
/** The current token we're considering. */
|
545
|
+
pm_token_t current;
|
354
546
|
|
355
|
-
|
356
|
-
|
357
|
-
|
547
|
+
/**
|
548
|
+
* This is a special field set on the parser when we need the parser to jump
|
549
|
+
* to a specific location when lexing the next token, as opposed to just
|
550
|
+
* using the end of the previous token. Normally this is NULL.
|
551
|
+
*/
|
358
552
|
const uint8_t *next_start;
|
359
553
|
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
554
|
+
/**
|
555
|
+
* This field indicates the end of a heredoc whose identifier was found on
|
556
|
+
* the current line. If another heredoc is found on the same line, then this
|
557
|
+
* will be moved forward to the end of that heredoc. If no heredocs are
|
558
|
+
* found on a line then this is NULL.
|
559
|
+
*/
|
364
560
|
const uint8_t *heredoc_end;
|
365
561
|
|
366
|
-
|
367
|
-
pm_list_t
|
368
|
-
pm_list_t warning_list; // the list of warnings that have been found while parsing
|
369
|
-
pm_list_t error_list; // the list of errors that have been found while parsing
|
370
|
-
pm_scope_t *current_scope; // the current local scope
|
562
|
+
/** The list of comments that have been found while parsing. */
|
563
|
+
pm_list_t comment_list;
|
371
564
|
|
372
|
-
|
565
|
+
/** The list of magic comments that have been found while parsing. */
|
566
|
+
pm_list_t magic_comment_list;
|
373
567
|
|
374
|
-
|
375
|
-
|
568
|
+
/** The list of warnings that have been found while parsing. */
|
569
|
+
pm_list_t warning_list;
|
570
|
+
|
571
|
+
/** The list of errors that have been found while parsing. */
|
572
|
+
pm_list_t error_list;
|
573
|
+
|
574
|
+
/** The current local scope. */
|
575
|
+
pm_scope_t *current_scope;
|
576
|
+
|
577
|
+
/** The current parsing context. */
|
578
|
+
pm_context_node_t *current_context;
|
579
|
+
|
580
|
+
/**
|
581
|
+
* The encoding functions for the current file are attached to the parser as
|
582
|
+
* it's parsing so that it can change with a magic comment.
|
583
|
+
*/
|
376
584
|
pm_encoding_t encoding;
|
377
585
|
|
378
|
-
|
379
|
-
|
380
|
-
|
586
|
+
/**
|
587
|
+
* When the encoding that is being used to parse the source is changed by
|
588
|
+
* prism, we provide the ability here to call out to a user-defined
|
589
|
+
* function.
|
590
|
+
*/
|
381
591
|
pm_encoding_changed_callback_t encoding_changed_callback;
|
382
592
|
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
593
|
+
/**
|
594
|
+
* When an encoding is encountered that isn't understood by prism, we
|
595
|
+
* provide the ability here to call out to a user-defined function to get an
|
596
|
+
* encoding struct. If the function returns something that isn't NULL, we
|
597
|
+
* set that to our encoding and use it to parse identifiers.
|
598
|
+
*/
|
387
599
|
pm_encoding_decode_callback_t encoding_decode_callback;
|
388
600
|
|
389
|
-
|
390
|
-
|
601
|
+
/**
|
602
|
+
* This pointer indicates where a comment must start if it is to be
|
603
|
+
* considered an encoding comment.
|
604
|
+
*/
|
391
605
|
const uint8_t *encoding_comment_start;
|
392
606
|
|
393
|
-
|
394
|
-
|
607
|
+
/**
|
608
|
+
* This is an optional callback that can be attached to the parser that will
|
609
|
+
* be called whenever a new token is lexed by the parser.
|
610
|
+
*/
|
395
611
|
pm_lex_callback_t *lex_callback;
|
396
612
|
|
397
|
-
|
398
|
-
|
613
|
+
/**
|
614
|
+
* This is the path of the file being parsed. We use the filepath when
|
615
|
+
* constructing SourceFileNodes.
|
616
|
+
*/
|
399
617
|
pm_string_t filepath_string;
|
400
618
|
|
401
|
-
|
402
|
-
|
619
|
+
/**
|
620
|
+
* This constant pool keeps all of the constants defined throughout the file
|
621
|
+
* so that we can reference them later.
|
622
|
+
*/
|
403
623
|
pm_constant_pool_t constant_pool;
|
404
624
|
|
405
|
-
|
625
|
+
/** This is the list of newline offsets in the source file. */
|
406
626
|
pm_newline_list_t newline_list;
|
407
627
|
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
628
|
+
/**
|
629
|
+
* We want to add a flag to integer nodes that indicates their base. We only
|
630
|
+
* want to parse these once, but we don't have space on the token itself to
|
631
|
+
* communicate this information. So we store it here and pass it through
|
632
|
+
* when we find tokens that we need it for.
|
633
|
+
*/
|
412
634
|
pm_node_flags_t integer_base;
|
413
635
|
|
414
|
-
|
415
|
-
|
636
|
+
/**
|
637
|
+
* This string is used to pass information from the lexer to the parser. It
|
638
|
+
* is particularly necessary because of escape sequences.
|
639
|
+
*/
|
416
640
|
pm_string_t current_string;
|
417
641
|
|
418
|
-
|
642
|
+
/**
|
643
|
+
* The line number at the start of the parse. This will be used to offset
|
644
|
+
* the line numbers of all of the locations.
|
645
|
+
*/
|
646
|
+
uint32_t start_line;
|
647
|
+
|
648
|
+
/** Whether or not we're at the beginning of a command. */
|
419
649
|
bool command_start;
|
420
650
|
|
421
|
-
|
651
|
+
/** Whether or not we're currently recovering from a syntax error. */
|
422
652
|
bool recovering;
|
423
653
|
|
424
|
-
|
425
|
-
|
426
|
-
|
654
|
+
/**
|
655
|
+
* Whether or not the encoding has been changed by a magic comment. We use
|
656
|
+
* this to provide a fast path for the lexer instead of going through the
|
657
|
+
* function pointer.
|
658
|
+
*/
|
427
659
|
bool encoding_changed;
|
428
660
|
|
429
|
-
|
430
|
-
|
661
|
+
/**
|
662
|
+
* This flag indicates that we are currently parsing a pattern matching
|
663
|
+
* expression and impacts the calculation of newlines.
|
664
|
+
*/
|
431
665
|
bool pattern_matching_newlines;
|
432
666
|
|
433
|
-
|
667
|
+
/** This flag indicates that we are currently parsing a keyword argument. */
|
434
668
|
bool in_keyword_arg;
|
435
669
|
|
436
|
-
|
437
|
-
|
670
|
+
/**
|
671
|
+
* Whether or not the parser has seen a token that has semantic meaning
|
672
|
+
* (i.e., a token that is not a comment or whitespace).
|
673
|
+
*/
|
438
674
|
bool semantic_token_seen;
|
439
675
|
|
440
|
-
|
441
|
-
|
676
|
+
/**
|
677
|
+
* Whether or not we have found a frozen_string_literal magic comment with
|
678
|
+
* a true value.
|
679
|
+
*/
|
442
680
|
bool frozen_string_literal;
|
681
|
+
|
682
|
+
/**
|
683
|
+
* Whether or not we should suppress warnings. This will be set to true if the
|
684
|
+
* consumer of the library specified it, usually because they are parsing
|
685
|
+
* when $VERBOSE is nil.
|
686
|
+
*/
|
687
|
+
bool suppress_warnings;
|
443
688
|
};
|
444
689
|
|
445
|
-
#endif
|
690
|
+
#endif
|