prism 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +172 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +62 -0
- data/LICENSE.md +7 -0
- data/Makefile +84 -0
- data/README.md +89 -0
- data/config.yml +2481 -0
- data/docs/build_system.md +74 -0
- data/docs/building.md +22 -0
- data/docs/configuration.md +60 -0
- data/docs/design.md +53 -0
- data/docs/encoding.md +117 -0
- data/docs/fuzzing.md +93 -0
- data/docs/heredocs.md +36 -0
- data/docs/mapping.md +117 -0
- data/docs/ripper.md +36 -0
- data/docs/ruby_api.md +25 -0
- data/docs/serialization.md +181 -0
- data/docs/testing.md +55 -0
- data/ext/prism/api_node.c +4725 -0
- data/ext/prism/api_pack.c +256 -0
- data/ext/prism/extconf.rb +136 -0
- data/ext/prism/extension.c +626 -0
- data/ext/prism/extension.h +18 -0
- data/include/prism/ast.h +1932 -0
- data/include/prism/defines.h +45 -0
- data/include/prism/diagnostic.h +231 -0
- data/include/prism/enc/pm_encoding.h +95 -0
- data/include/prism/node.h +41 -0
- data/include/prism/pack.h +141 -0
- data/include/prism/parser.h +418 -0
- data/include/prism/regexp.h +19 -0
- data/include/prism/unescape.h +48 -0
- data/include/prism/util/pm_buffer.h +51 -0
- data/include/prism/util/pm_char.h +91 -0
- data/include/prism/util/pm_constant_pool.h +78 -0
- data/include/prism/util/pm_list.h +67 -0
- data/include/prism/util/pm_memchr.h +14 -0
- data/include/prism/util/pm_newline_list.h +61 -0
- data/include/prism/util/pm_state_stack.h +24 -0
- data/include/prism/util/pm_string.h +61 -0
- data/include/prism/util/pm_string_list.h +25 -0
- data/include/prism/util/pm_strpbrk.h +29 -0
- data/include/prism/version.h +4 -0
- data/include/prism.h +82 -0
- data/lib/prism/compiler.rb +465 -0
- data/lib/prism/debug.rb +157 -0
- data/lib/prism/desugar_compiler.rb +206 -0
- data/lib/prism/dispatcher.rb +2051 -0
- data/lib/prism/dsl.rb +750 -0
- data/lib/prism/ffi.rb +251 -0
- data/lib/prism/lex_compat.rb +838 -0
- data/lib/prism/mutation_compiler.rb +718 -0
- data/lib/prism/node.rb +14540 -0
- data/lib/prism/node_ext.rb +55 -0
- data/lib/prism/node_inspector.rb +68 -0
- data/lib/prism/pack.rb +185 -0
- data/lib/prism/parse_result/comments.rb +172 -0
- data/lib/prism/parse_result/newlines.rb +60 -0
- data/lib/prism/parse_result.rb +266 -0
- data/lib/prism/pattern.rb +239 -0
- data/lib/prism/ripper_compat.rb +174 -0
- data/lib/prism/serialize.rb +662 -0
- data/lib/prism/visitor.rb +470 -0
- data/lib/prism.rb +64 -0
- data/prism.gemspec +113 -0
- data/src/diagnostic.c +287 -0
- data/src/enc/pm_big5.c +52 -0
- data/src/enc/pm_euc_jp.c +58 -0
- data/src/enc/pm_gbk.c +61 -0
- data/src/enc/pm_shift_jis.c +56 -0
- data/src/enc/pm_tables.c +507 -0
- data/src/enc/pm_unicode.c +2324 -0
- data/src/enc/pm_windows_31j.c +56 -0
- data/src/node.c +2633 -0
- data/src/pack.c +493 -0
- data/src/prettyprint.c +2136 -0
- data/src/prism.c +14587 -0
- data/src/regexp.c +580 -0
- data/src/serialize.c +1899 -0
- data/src/token_type.c +349 -0
- data/src/unescape.c +637 -0
- data/src/util/pm_buffer.c +103 -0
- data/src/util/pm_char.c +272 -0
- data/src/util/pm_constant_pool.c +252 -0
- data/src/util/pm_list.c +41 -0
- data/src/util/pm_memchr.c +33 -0
- data/src/util/pm_newline_list.c +134 -0
- data/src/util/pm_state_stack.c +19 -0
- data/src/util/pm_string.c +200 -0
- data/src/util/pm_string_list.c +29 -0
- data/src/util/pm_strncasecmp.c +17 -0
- data/src/util/pm_strpbrk.c +66 -0
- metadata +138 -0
@@ -0,0 +1,418 @@
|
|
1
|
+
#ifndef PRISM_PARSER_H
|
2
|
+
#define PRISM_PARSER_H
|
3
|
+
|
4
|
+
#include "prism/ast.h"
|
5
|
+
#include "prism/defines.h"
|
6
|
+
#include "prism/enc/pm_encoding.h"
|
7
|
+
#include "prism/util/pm_constant_pool.h"
|
8
|
+
#include "prism/util/pm_list.h"
|
9
|
+
#include "prism/util/pm_newline_list.h"
|
10
|
+
#include "prism/util/pm_state_stack.h"
|
11
|
+
|
12
|
+
#include <stdbool.h>
|
13
|
+
|
14
|
+
// This enum provides various bits that represent different kinds of states that
|
15
|
+
// the lexer can track. This is used to determine which kind of token to return
|
16
|
+
// based on the context of the parser.
|
17
|
+
typedef enum {
|
18
|
+
PM_LEX_STATE_BIT_BEG,
|
19
|
+
PM_LEX_STATE_BIT_END,
|
20
|
+
PM_LEX_STATE_BIT_ENDARG,
|
21
|
+
PM_LEX_STATE_BIT_ENDFN,
|
22
|
+
PM_LEX_STATE_BIT_ARG,
|
23
|
+
PM_LEX_STATE_BIT_CMDARG,
|
24
|
+
PM_LEX_STATE_BIT_MID,
|
25
|
+
PM_LEX_STATE_BIT_FNAME,
|
26
|
+
PM_LEX_STATE_BIT_DOT,
|
27
|
+
PM_LEX_STATE_BIT_CLASS,
|
28
|
+
PM_LEX_STATE_BIT_LABEL,
|
29
|
+
PM_LEX_STATE_BIT_LABELED,
|
30
|
+
PM_LEX_STATE_BIT_FITEM
|
31
|
+
} pm_lex_state_bit_t;
|
32
|
+
|
33
|
+
// This enum combines the various bits from the above enum into individual
|
34
|
+
// values that represent the various states of the lexer.
|
35
|
+
typedef enum {
|
36
|
+
PM_LEX_STATE_NONE = 0,
|
37
|
+
PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG),
|
38
|
+
PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END),
|
39
|
+
PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG),
|
40
|
+
PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN),
|
41
|
+
PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG),
|
42
|
+
PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG),
|
43
|
+
PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID),
|
44
|
+
PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME),
|
45
|
+
PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT),
|
46
|
+
PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS),
|
47
|
+
PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL),
|
48
|
+
PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED),
|
49
|
+
PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM),
|
50
|
+
PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS,
|
51
|
+
PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG,
|
52
|
+
PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN
|
53
|
+
} pm_lex_state_t;
|
54
|
+
|
55
|
+
// The quoting style of a heredoc. PM_HEREDOC_QUOTE_NONE is 0; every other
// variant's value is the corresponding quote character itself.
// NOTE(review): presumably this is the quote wrapped around the heredoc
// identifier (e.g. <<'EOS') -- confirm against the lexer.
typedef enum {
|
56
|
+
PM_HEREDOC_QUOTE_NONE,
|
57
|
+
PM_HEREDOC_QUOTE_SINGLE = '\'',
|
58
|
+
PM_HEREDOC_QUOTE_DOUBLE = '"',
|
59
|
+
PM_HEREDOC_QUOTE_BACKTICK = '`',
|
60
|
+
} pm_heredoc_quote_t;
|
61
|
+
|
62
|
+
// The indentation style of a heredoc.
// NOTE(review): DASH and TILDE presumably correspond to the <<- and <<~
// heredoc forms -- confirm against the lexer.
typedef enum {
|
63
|
+
PM_HEREDOC_INDENT_NONE,
|
64
|
+
PM_HEREDOC_INDENT_DASH,
|
65
|
+
PM_HEREDOC_INDENT_TILDE,
|
66
|
+
} pm_heredoc_indent_t;
|
67
|
+
|
68
|
+
// When lexing Ruby source, the lexer has a small amount of state to tell which
|
69
|
+
// kind of token it is currently lexing. For example, when we find the start of
|
70
|
+
// a string, the first token that we return is a TOKEN_STRING_BEGIN token. After
|
71
|
+
// that the lexer is now in the PM_LEX_STRING mode, and will return tokens that
|
72
|
+
// are found as part of a string.
|
73
|
+
typedef struct pm_lex_mode {
|
74
|
+
enum {
|
75
|
+
// This state is used when any given token is being lexed.
|
76
|
+
PM_LEX_DEFAULT,
|
77
|
+
|
78
|
+
// This state is used when we're lexing as normal but inside an embedded
|
79
|
+
// expression of a string.
|
80
|
+
PM_LEX_EMBEXPR,
|
81
|
+
|
82
|
+
// This state is used when we're lexing a variable that is embedded
|
83
|
+
// directly inside of a string with the # shorthand.
|
84
|
+
PM_LEX_EMBVAR,
|
85
|
+
|
86
|
+
// This state is used when you are inside the content of a heredoc.
|
87
|
+
PM_LEX_HEREDOC,
|
88
|
+
|
89
|
+
// This state is used when we are lexing a list of tokens, as in a %w
|
90
|
+
// word list literal or a %i symbol list literal.
|
91
|
+
PM_LEX_LIST,
|
92
|
+
|
93
|
+
// This state is used when a regular expression has been begun and we
|
94
|
+
// are looking for the terminator.
|
95
|
+
PM_LEX_REGEXP,
|
96
|
+
|
97
|
+
// This state is used when we are lexing a string or a string-like
|
98
|
+
// token, as in string content with either quote or an xstring.
|
99
|
+
PM_LEX_STRING
|
100
|
+
} mode;
|
101
|
+
|
102
|
+
union {
|
103
|
+
struct {
|
104
|
+
// This keeps track of the nesting level of the list.
|
105
|
+
size_t nesting;
|
106
|
+
|
107
|
+
// Whether or not interpolation is allowed in this list.
|
108
|
+
bool interpolation;
|
109
|
+
|
110
|
+
// When lexing a list, it takes into account balancing the
|
111
|
+
// terminator if the terminator is one of (), [], {}, or <>.
|
112
|
+
uint8_t incrementor;
|
113
|
+
|
114
|
+
// This is the terminator of the list literal.
|
115
|
+
uint8_t terminator;
|
116
|
+
|
117
|
+
// This is the character set that should be used to delimit the
|
118
|
+
// tokens within the list.
|
119
|
+
uint8_t breakpoints[11];
|
120
|
+
} list;
|
121
|
+
|
122
|
+
struct {
|
123
|
+
// This keeps track of the nesting level of the regular expression.
|
124
|
+
size_t nesting;
|
125
|
+
|
126
|
+
// When lexing a regular expression, it takes into account balancing
|
127
|
+
// the terminator if the terminator is one of (), [], {}, or <>.
|
128
|
+
uint8_t incrementor;
|
129
|
+
|
130
|
+
// This is the terminator of the regular expression.
|
131
|
+
uint8_t terminator;
|
132
|
+
|
133
|
+
// This is the character set that should be used to delimit the
|
134
|
+
// tokens within the regular expression.
|
135
|
+
uint8_t breakpoints[6];
|
136
|
+
} regexp;
|
137
|
+
|
138
|
+
struct {
|
139
|
+
// This keeps track of the nesting level of the string.
|
140
|
+
size_t nesting;
|
141
|
+
|
142
|
+
// Whether or not interpolation is allowed in this string.
|
143
|
+
bool interpolation;
|
144
|
+
|
145
|
+
// Whether or not at the end of the string we should allow a :,
|
146
|
+
// which would indicate this was a dynamic symbol instead of a
|
147
|
+
// string.
|
148
|
+
bool label_allowed;
|
149
|
+
|
150
|
+
// When lexing a string, it takes into account balancing the
|
151
|
+
// terminator if the terminator is one of (), [], {}, or <>.
|
152
|
+
uint8_t incrementor;
|
153
|
+
|
154
|
+
// This is the terminator of the string. It is typically either a
|
155
|
+
// single or double quote.
|
156
|
+
uint8_t terminator;
|
157
|
+
|
158
|
+
// This is the character set that should be used to delimit the
|
159
|
+
// tokens within the string.
|
160
|
+
uint8_t breakpoints[6];
|
161
|
+
} string;
|
162
|
+
|
163
|
+
struct {
|
164
|
+
// This pointer and length describe the start and the size of the heredoc
|
165
|
+
// identifier.
|
166
|
+
const uint8_t *ident_start;
|
167
|
+
size_t ident_length;
|
168
|
+
|
169
|
+
pm_heredoc_quote_t quote;
|
170
|
+
pm_heredoc_indent_t indent;
|
171
|
+
|
172
|
+
// This is the pointer to the character where lexing should resume
|
173
|
+
// once the heredoc has been completely processed.
|
174
|
+
const uint8_t *next_start;
|
175
|
+
} heredoc;
|
176
|
+
} as;
|
177
|
+
|
178
|
+
// The previous lex state so that it knows how to pop.
|
179
|
+
struct pm_lex_mode *prev;
|
180
|
+
} pm_lex_mode_t;
|
181
|
+
|
182
|
+
// We pre-allocate a certain number of lex states in order to avoid having to
|
183
|
+
// call malloc too many times while parsing. You really shouldn't need more than
|
184
|
+
// this because you only really nest deeply when doing string interpolation.
|
185
|
+
#define PM_LEX_STACK_SIZE 4
|
186
|
+
|
187
|
+
// A forward declaration since our error handler struct accepts a parser for
|
188
|
+
// each of its function calls.
|
189
|
+
typedef struct pm_parser pm_parser_t;
|
190
|
+
|
191
|
+
// While parsing, we keep track of a stack of contexts. This is helpful for
|
192
|
+
// error recovery so that we can pop back to a previous context when we hit a
|
193
|
+
// token that is understood by a parent context but not by the current context.
|
194
|
+
typedef enum {
|
195
|
+
PM_CONTEXT_BEGIN, // a begin statement
|
196
|
+
PM_CONTEXT_BLOCK_BRACES, // expressions in block arguments using braces
|
197
|
+
PM_CONTEXT_BLOCK_KEYWORDS, // expressions in block arguments using do..end
|
198
|
+
PM_CONTEXT_CASE_WHEN, // a case when statement
|
199
|
+
PM_CONTEXT_CASE_IN, // a case in statement
|
200
|
+
PM_CONTEXT_CLASS, // a class declaration
|
201
|
+
PM_CONTEXT_DEF, // a method definition
|
202
|
+
PM_CONTEXT_DEF_PARAMS, // a method definition's parameters
|
203
|
+
PM_CONTEXT_DEFAULT_PARAMS, // a method definition's default parameter
|
204
|
+
PM_CONTEXT_ELSE, // an else clause
|
205
|
+
PM_CONTEXT_ELSIF, // an elsif clause
|
206
|
+
PM_CONTEXT_EMBEXPR, // an interpolated expression
|
207
|
+
PM_CONTEXT_ENSURE, // an ensure statement
|
208
|
+
PM_CONTEXT_FOR, // a for loop
|
209
|
+
PM_CONTEXT_IF, // an if statement
|
210
|
+
PM_CONTEXT_LAMBDA_BRACES, // a lambda expression with braces
|
211
|
+
PM_CONTEXT_LAMBDA_DO_END, // a lambda expression with do..end
|
212
|
+
PM_CONTEXT_MAIN, // the top level context
|
213
|
+
PM_CONTEXT_MODULE, // a module declaration
|
214
|
+
PM_CONTEXT_PARENS, // a parenthesized expression
|
215
|
+
PM_CONTEXT_POSTEXE, // an END block
|
216
|
+
PM_CONTEXT_PREDICATE, // a predicate inside an if/elsif/unless statement
|
217
|
+
PM_CONTEXT_PREEXE, // a BEGIN block
|
218
|
+
PM_CONTEXT_RESCUE_ELSE, // a rescue else statement
|
219
|
+
PM_CONTEXT_RESCUE, // a rescue statement
|
220
|
+
PM_CONTEXT_SCLASS, // a singleton class definition
|
221
|
+
PM_CONTEXT_UNLESS, // an unless statement
|
222
|
+
PM_CONTEXT_UNTIL, // an until statement
|
223
|
+
PM_CONTEXT_WHILE, // a while statement
|
224
|
+
} pm_context_t;
|
225
|
+
|
226
|
+
// This is a node in a linked list of contexts.
|
227
|
+
typedef struct pm_context_node {
|
228
|
+
pm_context_t context;
|
229
|
+
struct pm_context_node *prev;
|
230
|
+
} pm_context_node_t;
|
231
|
+
|
232
|
+
// This is the type of a comment that we've found while parsing.
|
233
|
+
typedef enum {
|
234
|
+
PM_COMMENT_INLINE,
|
235
|
+
PM_COMMENT_EMBDOC,
|
236
|
+
PM_COMMENT___END__
|
237
|
+
} pm_comment_type_t;
|
238
|
+
|
239
|
+
// This is a node in the linked list of comments that we've found while parsing.
|
240
|
+
typedef struct pm_comment {
|
241
|
+
pm_list_node_t node;
|
242
|
+
const uint8_t *start;
|
243
|
+
const uint8_t *end;
|
244
|
+
pm_comment_type_t type;
|
245
|
+
} pm_comment_t;
|
246
|
+
|
247
|
+
// When the encoding that is being used to parse the source is changed by prism,
|
248
|
+
// we provide the ability here to call out to a user-defined function.
|
249
|
+
typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser);
|
250
|
+
|
251
|
+
// When an encoding is encountered that isn't understood by prism, we provide
|
252
|
+
// the ability here to call out to a user-defined function to get an encoding
|
253
|
+
// struct. If the function returns something that isn't NULL, we set that to
|
254
|
+
// our encoding and use it to parse identifiers.
|
255
|
+
typedef pm_encoding_t *(*pm_encoding_decode_callback_t)(pm_parser_t *parser, const uint8_t *name, size_t width);
|
256
|
+
|
257
|
+
// When you are lexing through a file, the lexer needs all of the information
|
258
|
+
// that the parser additionally provides (for example, the local table). So if
|
259
|
+
// you want to properly lex Ruby, you need to actually lex it in the context of
|
260
|
+
// the parser. In order to provide this functionality, we optionally allow a
|
261
|
+
// struct to be attached to the parser that calls back out to a user-provided
|
262
|
+
// callback when each token is lexed.
|
263
|
+
typedef struct {
|
264
|
+
// This opaque pointer is used to provide whatever information the user
|
265
|
+
// deemed necessary to the callback. In our case we use it to pass the array
|
266
|
+
// that the tokens get appended into.
|
267
|
+
void *data;
|
268
|
+
|
269
|
+
// This is the callback that is called when a token is lexed. It is passed
|
270
|
+
// the opaque data pointer, the parser, and the token that was lexed.
|
271
|
+
void (*callback)(void *data, pm_parser_t *parser, pm_token_t *token);
|
272
|
+
} pm_lex_callback_t;
|
273
|
+
|
274
|
+
// This struct represents a node in a linked list of scopes. Some scopes can see
|
275
|
+
// into their parent scopes, while others cannot.
|
276
|
+
typedef struct pm_scope {
|
277
|
+
// The IDs of the locals in the given scope.
|
278
|
+
pm_constant_id_list_t locals;
|
279
|
+
|
280
|
+
// A pointer to the previous scope in the linked list.
|
281
|
+
struct pm_scope *previous;
|
282
|
+
|
283
|
+
// A boolean indicating whether or not this scope can see into its parent.
|
284
|
+
// If closed is true, then the scope cannot see into its parent.
|
285
|
+
bool closed;
|
286
|
+
|
287
|
+
// A boolean indicating whether or not this scope has explicit parameters.
|
288
|
+
// This is necessary to determine whether or not numbered parameters are
|
289
|
+
// allowed.
|
290
|
+
bool explicit_params;
|
291
|
+
|
292
|
+
// A boolean indicating whether or not this scope has numbered parameters.
|
293
|
+
// This is necessary to determine if child blocks are allowed to use
|
294
|
+
// numbered parameters.
|
295
|
+
bool numbered_params;
|
296
|
+
} pm_scope_t;
|
297
|
+
|
298
|
+
// This struct represents the overall parser. It contains a reference to the
|
299
|
+
// source file, as well as pointers that indicate where in the source it's
|
300
|
+
// currently parsing. It also contains the most recent and current token that
|
301
|
+
// it's considering.
|
302
|
+
struct pm_parser {
|
303
|
+
pm_lex_state_t lex_state; // the current state of the lexer
|
304
|
+
int enclosure_nesting; // tracks the current nesting of (), [], and {}
|
305
|
+
|
306
|
+
// Used to temporarily track the nesting of enclosures to determine if a {
|
307
|
+
// is the beginning of a lambda following the parameters of a lambda.
|
308
|
+
int lambda_enclosure_nesting;
|
309
|
+
|
310
|
+
// Used to track the nesting of braces to ensure we get the correct value
|
311
|
+
// when we are interpolating blocks with braces.
|
312
|
+
int brace_nesting;
|
313
|
+
|
314
|
+
// the stack used to determine if a do keyword belongs to the predicate of a
|
315
|
+
// while, until, or for loop
|
316
|
+
pm_state_stack_t do_loop_stack;
|
317
|
+
|
318
|
+
// the stack used to determine if a do keyword belongs to the beginning of a
|
319
|
+
// block
|
320
|
+
pm_state_stack_t accepts_block_stack;
|
321
|
+
|
322
|
+
struct {
|
323
|
+
pm_lex_mode_t *current; // the current mode of the lexer
|
324
|
+
pm_lex_mode_t stack[PM_LEX_STACK_SIZE]; // the stack of lexer modes
|
325
|
+
size_t index; // the current index into the lexer mode stack
|
326
|
+
} lex_modes;
|
327
|
+
|
328
|
+
const uint8_t *start; // the pointer to the start of the source
|
329
|
+
const uint8_t *end; // the pointer to the end of the source
|
330
|
+
pm_token_t previous; // the previous token we were considering
|
331
|
+
pm_token_t current; // the current token we're considering
|
332
|
+
|
333
|
+
// This is a special field set on the parser when we need the parser to jump
|
334
|
+
// to a specific location when lexing the next token, as opposed to just
|
335
|
+
// using the end of the previous token. Normally this is NULL.
|
336
|
+
const uint8_t *next_start;
|
337
|
+
|
338
|
+
// This field indicates the end of a heredoc whose identifier was found on
|
339
|
+
// the current line. If another heredoc is found on the same line, then this
|
340
|
+
// will be moved forward to the end of that heredoc. If no heredocs are
|
341
|
+
// found on a line then this is NULL.
|
342
|
+
const uint8_t *heredoc_end;
|
343
|
+
|
344
|
+
pm_list_t comment_list; // the list of comments that have been found while parsing
|
345
|
+
pm_list_t warning_list; // the list of warnings that have been found while parsing
|
346
|
+
pm_list_t error_list; // the list of errors that have been found while parsing
|
347
|
+
pm_scope_t *current_scope; // the current local scope
|
348
|
+
|
349
|
+
pm_context_node_t *current_context; // the current parsing context
|
350
|
+
|
351
|
+
// The encoding functions for the current file are attached to the parser as
|
352
|
+
// it's parsing so that it can change with a magic comment.
|
353
|
+
pm_encoding_t encoding;
|
354
|
+
|
355
|
+
// When the encoding that is being used to parse the source is changed by
|
356
|
+
// prism, we provide the ability here to call out to a user-defined
|
357
|
+
// function.
|
358
|
+
pm_encoding_changed_callback_t encoding_changed_callback;
|
359
|
+
|
360
|
+
// When an encoding is encountered that isn't understood by prism, we
|
361
|
+
// provide the ability here to call out to a user-defined function to get an
|
362
|
+
// encoding struct. If the function returns something that isn't NULL, we
|
363
|
+
// set that to our encoding and use it to parse identifiers.
|
364
|
+
pm_encoding_decode_callback_t encoding_decode_callback;
|
365
|
+
|
366
|
+
// This pointer indicates where a comment must start if it is to be
|
367
|
+
// considered an encoding comment.
|
368
|
+
const uint8_t *encoding_comment_start;
|
369
|
+
|
370
|
+
// This is an optional callback that can be attached to the parser that will
|
371
|
+
// be called whenever a new token is lexed by the parser.
|
372
|
+
pm_lex_callback_t *lex_callback;
|
373
|
+
|
374
|
+
// This is the path of the file being parsed. We use the filepath when
|
375
|
+
// constructing SourceFileNodes.
|
376
|
+
pm_string_t filepath_string;
|
377
|
+
|
378
|
+
// This constant pool keeps all of the constants defined throughout the file
|
379
|
+
// so that we can reference them later.
|
380
|
+
pm_constant_pool_t constant_pool;
|
381
|
+
|
382
|
+
// This is the list of newline offsets in the source file.
|
383
|
+
pm_newline_list_t newline_list;
|
384
|
+
|
385
|
+
// We want to add a flag to integer nodes that indicates their base. We only
|
386
|
+
// want to parse these once, but we don't have space on the token itself to
|
387
|
+
// communicate this information. So we store it here and pass it through
|
388
|
+
// when we find tokens that we need it for.
|
389
|
+
pm_node_flags_t integer_base;
|
390
|
+
|
391
|
+
// Whether or not we're at the beginning of a command
|
392
|
+
bool command_start;
|
393
|
+
|
394
|
+
// Whether or not we're currently recovering from a syntax error
|
395
|
+
bool recovering;
|
396
|
+
|
397
|
+
// Whether or not the encoding has been changed by a magic comment. We use
|
398
|
+
// this to provide a fast path for the lexer instead of going through the
|
399
|
+
// function pointer.
|
400
|
+
bool encoding_changed;
|
401
|
+
|
402
|
+
// This flag indicates that we are currently parsing a pattern matching
|
403
|
+
// expression and impacts the calculation of newlines.
|
404
|
+
bool pattern_matching_newlines;
|
405
|
+
|
406
|
+
// This flag indicates that we are currently parsing a keyword argument.
|
407
|
+
bool in_keyword_arg;
|
408
|
+
|
409
|
+
// Whether or not the parser has seen a token that has semantic meaning
|
410
|
+
// (i.e., a token that is not a comment or whitespace).
|
411
|
+
bool semantic_token_seen;
|
412
|
+
|
413
|
+
// Whether or not we have found a frozen_string_literal magic comment with
|
414
|
+
// a true value.
|
415
|
+
bool frozen_string_literal;
|
416
|
+
};
|
417
|
+
|
418
|
+
#endif // PRISM_PARSER_H
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#ifndef PRISM_REGEXP_H
|
2
|
+
#define PRISM_REGEXP_H
|
3
|
+
|
4
|
+
#include "prism/defines.h"
|
5
|
+
#include "prism/parser.h"
|
6
|
+
#include "prism/enc/pm_encoding.h"
|
7
|
+
#include "prism/util/pm_memchr.h"
|
8
|
+
#include "prism/util/pm_string_list.h"
|
9
|
+
#include "prism/util/pm_string.h"
|
10
|
+
|
11
|
+
#include <stdbool.h>
|
12
|
+
#include <stddef.h>
|
13
|
+
#include <string.h>
|
14
|
+
|
15
|
+
// Parse a regular expression and extract the names of all of the named capture
|
16
|
+
// groups.
|
17
|
+
PRISM_EXPORTED_FUNCTION bool pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding);
|
18
|
+
|
19
|
+
#endif
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#ifndef PRISM_UNESCAPE_H
|
2
|
+
#define PRISM_UNESCAPE_H
|
3
|
+
|
4
|
+
#include "prism/defines.h"
|
5
|
+
#include "prism/diagnostic.h"
|
6
|
+
#include "prism/parser.h"
|
7
|
+
#include "prism/util/pm_char.h"
|
8
|
+
#include "prism/util/pm_list.h"
|
9
|
+
#include "prism/util/pm_memchr.h"
|
10
|
+
#include "prism/util/pm_string.h"
|
11
|
+
|
12
|
+
#include <assert.h>
|
13
|
+
#include <stdbool.h>
|
14
|
+
#include <stdint.h>
|
15
|
+
#include <string.h>
|
16
|
+
|
17
|
+
// The type of unescape we are performing.
|
18
|
+
typedef enum {
|
19
|
+
// When we're creating a string inside of a list literal like %w, we
|
20
|
+
// shouldn't escape anything.
|
21
|
+
PM_UNESCAPE_NONE,
|
22
|
+
|
23
|
+
// When we're unescaping a single-quoted string, we only need to unescape
|
24
|
+
// single quotes and backslashes.
|
25
|
+
PM_UNESCAPE_MINIMAL,
|
26
|
+
|
27
|
+
// When we're unescaping a string list, in addition to MINIMAL, we need to
|
28
|
+
// unescape whitespace.
|
29
|
+
PM_UNESCAPE_WHITESPACE,
|
30
|
+
|
31
|
+
// When we're unescaping a double-quoted string, we need to unescape all
|
32
|
+
// escapes.
|
33
|
+
PM_UNESCAPE_ALL,
|
34
|
+
} pm_unescape_type_t;
|
35
|
+
|
36
|
+
// Unescape the contents of the given token into the given string using the given unescape mode.
|
37
|
+
PRISM_EXPORTED_FUNCTION void pm_unescape_manipulate_string(pm_parser_t *parser, pm_string_t *string, pm_unescape_type_t unescape_type);
|
38
|
+
void pm_unescape_manipulate_char_literal(pm_parser_t *parser, pm_string_t *string, pm_unescape_type_t unescape_type);
|
39
|
+
|
40
|
+
// Accepts a source string and a type of unescaping and returns the unescaped version.
|
41
|
+
// The caller must call pm_string_free(result) after calling this function.
|
42
|
+
PRISM_EXPORTED_FUNCTION bool pm_unescape_string(const uint8_t *start, size_t length, pm_unescape_type_t unescape_type, pm_string_t *result);
|
43
|
+
|
44
|
+
// Returns the number of bytes that encompass the first escape sequence in the
|
45
|
+
// given string.
|
46
|
+
size_t pm_unescape_calculate_difference(pm_parser_t *parser, const uint8_t *value, pm_unescape_type_t unescape_type, bool expect_single_codepoint);
|
47
|
+
|
48
|
+
#endif
|
@@ -0,0 +1,51 @@
|
|
1
|
+
#ifndef PRISM_BUFFER_H
|
2
|
+
#define PRISM_BUFFER_H
|
3
|
+
|
4
|
+
#include "prism/defines.h"
|
5
|
+
|
6
|
+
#include <assert.h>
|
7
|
+
#include <stdbool.h>
|
8
|
+
#include <stdint.h>
|
9
|
+
#include <stdlib.h>
|
10
|
+
#include <string.h>
|
11
|
+
|
12
|
+
// A pm_buffer_t is a simple memory buffer that stores data in a contiguous
|
13
|
+
// block of memory. It is used to store the serialized representation of a
|
14
|
+
// prism tree.
|
15
|
+
typedef struct {
|
16
|
+
char *value;
|
17
|
+
size_t length;
|
18
|
+
size_t capacity;
|
19
|
+
} pm_buffer_t;
|
20
|
+
|
21
|
+
// Return the size of the pm_buffer_t struct.
|
22
|
+
PRISM_EXPORTED_FUNCTION size_t pm_buffer_sizeof(void);
|
23
|
+
|
24
|
+
// Initialize a pm_buffer_t with its default values.
|
25
|
+
PRISM_EXPORTED_FUNCTION bool pm_buffer_init(pm_buffer_t *buffer);
|
26
|
+
|
27
|
+
// Return the value of the buffer.
|
28
|
+
PRISM_EXPORTED_FUNCTION char * pm_buffer_value(pm_buffer_t *buffer);
|
29
|
+
|
30
|
+
// Return the length of the buffer.
|
31
|
+
PRISM_EXPORTED_FUNCTION size_t pm_buffer_length(pm_buffer_t *buffer);
|
32
|
+
|
33
|
+
// Append the given amount of space as zeroes to the buffer.
|
34
|
+
void pm_buffer_append_zeroes(pm_buffer_t *buffer, size_t length);
|
35
|
+
|
36
|
+
// Append a string to the buffer.
|
37
|
+
void pm_buffer_append_str(pm_buffer_t *buffer, const char *value, size_t length);
|
38
|
+
|
39
|
+
// Append a list of bytes to the buffer.
|
40
|
+
void pm_buffer_append_bytes(pm_buffer_t *buffer, const uint8_t *value, size_t length);
|
41
|
+
|
42
|
+
// Append a single byte to the buffer.
|
43
|
+
void pm_buffer_append_u8(pm_buffer_t *buffer, uint8_t value);
|
44
|
+
|
45
|
+
// Append a 32-bit unsigned integer to the buffer.
|
46
|
+
void pm_buffer_append_u32(pm_buffer_t *buffer, uint32_t value);
|
47
|
+
|
48
|
+
// Free the memory associated with the buffer.
|
49
|
+
PRISM_EXPORTED_FUNCTION void pm_buffer_free(pm_buffer_t *buffer);
|
50
|
+
|
51
|
+
#endif
|
@@ -0,0 +1,91 @@
|
|
1
|
+
#ifndef PRISM_CHAR_H
|
2
|
+
#define PRISM_CHAR_H
|
3
|
+
|
4
|
+
#include "prism/defines.h"
|
5
|
+
#include "prism/util/pm_newline_list.h"
|
6
|
+
|
7
|
+
#include <stdbool.h>
|
8
|
+
#include <stddef.h>
|
9
|
+
|
10
|
+
// Returns the number of characters at the start of the string that are
|
11
|
+
// whitespace. Disallows searching past the given maximum number of characters.
|
12
|
+
size_t pm_strspn_whitespace(const uint8_t *string, ptrdiff_t length);
|
13
|
+
|
14
|
+
// Returns the number of characters at the start of the string that are
|
15
|
+
// whitespace while also tracking the location of each newline. Disallows
|
16
|
+
// searching past the given maximum number of characters.
|
17
|
+
size_t
|
18
|
+
pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_newline_list_t *newline_list);
|
19
|
+
|
20
|
+
// Returns the number of characters at the start of the string that are inline
|
21
|
+
// whitespace. Disallows searching past the given maximum number of characters.
|
22
|
+
size_t pm_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length);
|
23
|
+
|
24
|
+
// Returns the number of characters at the start of the string that are decimal
|
25
|
+
// digits. Disallows searching past the given maximum number of characters.
|
26
|
+
size_t pm_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length);
|
27
|
+
|
28
|
+
// Returns the number of characters at the start of the string that are
|
29
|
+
// hexadecimal digits. Disallows searching past the given maximum number of
|
30
|
+
// characters.
|
31
|
+
size_t pm_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length);
|
32
|
+
|
33
|
+
// Returns the number of characters at the start of the string that are octal
|
34
|
+
// digits or underscores. Disallows searching past the given maximum number of
|
35
|
+
// characters.
|
36
|
+
//
|
37
|
+
// If multiple underscores are found in a row or if an underscore is
|
38
|
+
// found at the end of the number, then the invalid pointer is set to the index
|
39
|
+
// of the first invalid underscore.
|
40
|
+
size_t pm_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
|
41
|
+
|
42
|
+
// Returns the number of characters at the start of the string that are decimal
|
43
|
+
// digits or underscores. Disallows searching past the given maximum number of
|
44
|
+
// characters.
|
45
|
+
//
|
46
|
+
// If multiple underscores are found in a row or if an underscore is
|
47
|
+
// found at the end of the number, then the invalid pointer is set to the index
|
48
|
+
// of the first invalid underscore.
|
49
|
+
size_t pm_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
|
50
|
+
|
51
|
+
// Returns the number of characters at the start of the string that are
|
52
|
+
// hexadecimal digits or underscores. Disallows searching past the given maximum
|
53
|
+
// number of characters.
|
54
|
+
//
|
55
|
+
// If multiple underscores are found in a row or if an underscore is
|
56
|
+
// found at the end of the number, then the invalid pointer is set to the index
|
57
|
+
// of the first invalid underscore.
|
58
|
+
size_t pm_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
|
59
|
+
|
60
|
+
// Returns the number of characters at the start of the string that are regexp
|
61
|
+
// options. Disallows searching past the given maximum number of characters.
|
62
|
+
size_t pm_strspn_regexp_option(const uint8_t *string, ptrdiff_t length);
|
63
|
+
|
64
|
+
// Returns the number of characters at the start of the string that are binary
|
65
|
+
// digits or underscores. Disallows searching past the given maximum number of
|
66
|
+
// characters.
|
67
|
+
//
|
68
|
+
// If multiple underscores are found in a row or if an underscore is
|
69
|
+
// found at the end of the number, then the invalid pointer is set to the index
|
70
|
+
// of the first invalid underscore.
|
71
|
+
size_t pm_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
|
72
|
+
|
73
|
+
// Returns true if the given character is a whitespace character.
|
74
|
+
bool pm_char_is_whitespace(const uint8_t b);
|
75
|
+
|
76
|
+
// Returns true if the given character is an inline whitespace character.
|
77
|
+
bool pm_char_is_inline_whitespace(const uint8_t b);
|
78
|
+
|
79
|
+
// Returns true if the given character is a binary digit.
|
80
|
+
bool pm_char_is_binary_digit(const uint8_t b);
|
81
|
+
|
82
|
+
// Returns true if the given character is an octal digit.
|
83
|
+
bool pm_char_is_octal_digit(const uint8_t b);
|
84
|
+
|
85
|
+
// Returns true if the given character is a decimal digit.
|
86
|
+
bool pm_char_is_decimal_digit(const uint8_t b);
|
87
|
+
|
88
|
+
// Returns true if the given character is a hexadecimal digit.
|
89
|
+
bool pm_char_is_hexadecimal_digit(const uint8_t b);
|
90
|
+
|
91
|
+
#endif
|