yarp 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +51 -0
- data/LICENSE.md +7 -0
- data/Makefile.in +79 -0
- data/README.md +86 -0
- data/config.h.in +25 -0
- data/config.yml +2147 -0
- data/configure +4487 -0
- data/docs/build_system.md +85 -0
- data/docs/building.md +26 -0
- data/docs/configuration.md +56 -0
- data/docs/design.md +53 -0
- data/docs/encoding.md +116 -0
- data/docs/extension.md +20 -0
- data/docs/fuzzing.md +93 -0
- data/docs/heredocs.md +36 -0
- data/docs/mapping.md +117 -0
- data/docs/ripper.md +36 -0
- data/docs/serialization.md +130 -0
- data/docs/testing.md +55 -0
- data/ext/yarp/api_node.c +3680 -0
- data/ext/yarp/api_pack.c +256 -0
- data/ext/yarp/extconf.rb +131 -0
- data/ext/yarp/extension.c +547 -0
- data/ext/yarp/extension.h +18 -0
- data/include/yarp/ast.h +1412 -0
- data/include/yarp/defines.h +54 -0
- data/include/yarp/diagnostic.h +24 -0
- data/include/yarp/enc/yp_encoding.h +94 -0
- data/include/yarp/node.h +36 -0
- data/include/yarp/pack.h +141 -0
- data/include/yarp/parser.h +389 -0
- data/include/yarp/regexp.h +19 -0
- data/include/yarp/unescape.h +42 -0
- data/include/yarp/util/yp_buffer.h +39 -0
- data/include/yarp/util/yp_char.h +75 -0
- data/include/yarp/util/yp_constant_pool.h +64 -0
- data/include/yarp/util/yp_list.h +67 -0
- data/include/yarp/util/yp_memchr.h +14 -0
- data/include/yarp/util/yp_newline_list.h +54 -0
- data/include/yarp/util/yp_state_stack.h +24 -0
- data/include/yarp/util/yp_string.h +57 -0
- data/include/yarp/util/yp_string_list.h +28 -0
- data/include/yarp/util/yp_strpbrk.h +29 -0
- data/include/yarp/version.h +5 -0
- data/include/yarp.h +69 -0
- data/lib/yarp/lex_compat.rb +759 -0
- data/lib/yarp/node.rb +7428 -0
- data/lib/yarp/pack.rb +185 -0
- data/lib/yarp/ripper_compat.rb +174 -0
- data/lib/yarp/serialize.rb +389 -0
- data/lib/yarp.rb +330 -0
- data/src/diagnostic.c +25 -0
- data/src/enc/yp_big5.c +79 -0
- data/src/enc/yp_euc_jp.c +85 -0
- data/src/enc/yp_gbk.c +88 -0
- data/src/enc/yp_shift_jis.c +83 -0
- data/src/enc/yp_tables.c +509 -0
- data/src/enc/yp_unicode.c +2320 -0
- data/src/enc/yp_windows_31j.c +83 -0
- data/src/node.c +2011 -0
- data/src/pack.c +493 -0
- data/src/prettyprint.c +1782 -0
- data/src/regexp.c +580 -0
- data/src/serialize.c +1576 -0
- data/src/token_type.c +347 -0
- data/src/unescape.c +576 -0
- data/src/util/yp_buffer.c +78 -0
- data/src/util/yp_char.c +229 -0
- data/src/util/yp_constant_pool.c +147 -0
- data/src/util/yp_list.c +50 -0
- data/src/util/yp_memchr.c +31 -0
- data/src/util/yp_newline_list.c +119 -0
- data/src/util/yp_state_stack.c +25 -0
- data/src/util/yp_string.c +207 -0
- data/src/util/yp_string_list.c +32 -0
- data/src/util/yp_strncasecmp.c +20 -0
- data/src/util/yp_strpbrk.c +66 -0
- data/src/yarp.c +13211 -0
- data/yarp.gemspec +100 -0
- metadata +125 -0
@@ -0,0 +1,389 @@
|
|
1
|
+
#ifndef YARP_PARSER_H
|
2
|
+
#define YARP_PARSER_H
|
3
|
+
|
4
|
+
#include "yarp/ast.h"
|
5
|
+
#include "yarp/defines.h"
|
6
|
+
#include "yarp/enc/yp_encoding.h"
|
7
|
+
#include "yarp/util/yp_constant_pool.h"
|
8
|
+
#include "yarp/util/yp_list.h"
|
9
|
+
#include "yarp/util/yp_newline_list.h"
|
10
|
+
#include "yarp/util/yp_state_stack.h"
|
11
|
+
|
12
|
+
#include <stdbool.h>
|
13
|
+
|
14
|
+
// This enum provides various bits that represent different kinds of states that
|
15
|
+
// the lexer can track. This is used to determine which kind of token to return
|
16
|
+
// based on the context of the parser.
|
17
|
+
typedef enum {
|
18
|
+
YP_LEX_STATE_BIT_BEG,
|
19
|
+
YP_LEX_STATE_BIT_END,
|
20
|
+
YP_LEX_STATE_BIT_ENDARG,
|
21
|
+
YP_LEX_STATE_BIT_ENDFN,
|
22
|
+
YP_LEX_STATE_BIT_ARG,
|
23
|
+
YP_LEX_STATE_BIT_CMDARG,
|
24
|
+
YP_LEX_STATE_BIT_MID,
|
25
|
+
YP_LEX_STATE_BIT_FNAME,
|
26
|
+
YP_LEX_STATE_BIT_DOT,
|
27
|
+
YP_LEX_STATE_BIT_CLASS,
|
28
|
+
YP_LEX_STATE_BIT_LABEL,
|
29
|
+
YP_LEX_STATE_BIT_LABELED,
|
30
|
+
YP_LEX_STATE_BIT_FITEM
|
31
|
+
} yp_lex_state_bit_t;
|
32
|
+
|
33
|
+
// This enum combines the various bits from the above enum into individual
|
34
|
+
// values that represent the various states of the lexer.
|
35
|
+
typedef enum {
|
36
|
+
YP_LEX_STATE_NONE = 0,
|
37
|
+
YP_LEX_STATE_BEG = (1 << YP_LEX_STATE_BIT_BEG),
|
38
|
+
YP_LEX_STATE_END = (1 << YP_LEX_STATE_BIT_END),
|
39
|
+
YP_LEX_STATE_ENDARG = (1 << YP_LEX_STATE_BIT_ENDARG),
|
40
|
+
YP_LEX_STATE_ENDFN = (1 << YP_LEX_STATE_BIT_ENDFN),
|
41
|
+
YP_LEX_STATE_ARG = (1 << YP_LEX_STATE_BIT_ARG),
|
42
|
+
YP_LEX_STATE_CMDARG = (1 << YP_LEX_STATE_BIT_CMDARG),
|
43
|
+
YP_LEX_STATE_MID = (1 << YP_LEX_STATE_BIT_MID),
|
44
|
+
YP_LEX_STATE_FNAME = (1 << YP_LEX_STATE_BIT_FNAME),
|
45
|
+
YP_LEX_STATE_DOT = (1 << YP_LEX_STATE_BIT_DOT),
|
46
|
+
YP_LEX_STATE_CLASS = (1 << YP_LEX_STATE_BIT_CLASS),
|
47
|
+
YP_LEX_STATE_LABEL = (1 << YP_LEX_STATE_BIT_LABEL),
|
48
|
+
YP_LEX_STATE_LABELED = (1 << YP_LEX_STATE_BIT_LABELED),
|
49
|
+
YP_LEX_STATE_FITEM = (1 << YP_LEX_STATE_BIT_FITEM),
|
50
|
+
YP_LEX_STATE_BEG_ANY = YP_LEX_STATE_BEG | YP_LEX_STATE_MID | YP_LEX_STATE_CLASS,
|
51
|
+
YP_LEX_STATE_ARG_ANY = YP_LEX_STATE_ARG | YP_LEX_STATE_CMDARG,
|
52
|
+
YP_LEX_STATE_END_ANY = YP_LEX_STATE_END | YP_LEX_STATE_ENDARG | YP_LEX_STATE_ENDFN
|
53
|
+
} yp_lex_state_t;
|
54
|
+
|
55
|
+
// The kinds of quoting recorded for a heredoc (this is the type of the `quote`
// field in the heredoc lex mode). Each quoted variant's value is the quote
// character itself; YP_HEREDOC_QUOTE_NONE takes the implicit value 0.
typedef enum {
|
56
|
+
YP_HEREDOC_QUOTE_NONE,
|
57
|
+
YP_HEREDOC_QUOTE_SINGLE = '\'',
|
58
|
+
YP_HEREDOC_QUOTE_DOUBLE = '"',
|
59
|
+
YP_HEREDOC_QUOTE_BACKTICK = '`',
|
60
|
+
} yp_heredoc_quote_t;
|
61
|
+
|
62
|
+
// The indentation styles recorded for a heredoc (this is the type of the
// `indent` field in the heredoc lex mode).
// NOTE(review): DASH and TILDE presumably correspond to the `<<-` and `<<~`
// heredoc forms - confirm against the lexer.
typedef enum {
|
63
|
+
YP_HEREDOC_INDENT_NONE,
|
64
|
+
YP_HEREDOC_INDENT_DASH,
|
65
|
+
YP_HEREDOC_INDENT_TILDE,
|
66
|
+
} yp_heredoc_indent_t;
|
67
|
+
|
68
|
+
// When lexing Ruby source, the lexer has a small amount of state to tell which
|
69
|
+
// kind of token it is currently lexing. For example, when we find the start of
|
70
|
+
// a string, the first token that we return is a TOKEN_STRING_BEGIN token. After
|
71
|
+
// that the lexer is now in the YP_LEX_STRING mode, and will return tokens that
|
72
|
+
// are found as part of a string.
|
73
|
+
typedef struct yp_lex_mode {
|
74
|
+
enum {
|
75
|
+
// This state is used when any given token is being lexed.
|
76
|
+
YP_LEX_DEFAULT,
|
77
|
+
|
78
|
+
// This state is used when we're lexing as normal but inside an embedded
|
79
|
+
// expression of a string.
|
80
|
+
YP_LEX_EMBEXPR,
|
81
|
+
|
82
|
+
// This state is used when we're lexing a variable that is embedded
|
83
|
+
// directly inside of a string with the # shorthand.
|
84
|
+
YP_LEX_EMBVAR,
|
85
|
+
|
86
|
+
// This state is used when you are inside the content of a heredoc.
|
87
|
+
YP_LEX_HEREDOC,
|
88
|
+
|
89
|
+
// This state is used when we are lexing a list of tokens, as in a %w
|
90
|
+
// word list literal or a %i symbol list literal.
|
91
|
+
YP_LEX_LIST,
|
92
|
+
|
93
|
+
// This state is used when a regular expression has been begun and we
|
94
|
+
// are looking for the terminator.
|
95
|
+
YP_LEX_REGEXP,
|
96
|
+
|
97
|
+
// This state is used when we are lexing a string or a string-like
|
98
|
+
// token, as in string content with either quote or an xstring.
|
99
|
+
YP_LEX_STRING
|
100
|
+
} mode;
|
101
|
+
|
102
|
+
union {
|
103
|
+
struct {
|
104
|
+
// This keeps track of the nesting level of the list.
|
105
|
+
size_t nesting;
|
106
|
+
|
107
|
+
// Whether or not interpolation is allowed in this list.
|
108
|
+
bool interpolation;
|
109
|
+
|
110
|
+
// When lexing a list, it takes into account balancing the
|
111
|
+
// terminator if the terminator is one of (), [], {}, or <>.
|
112
|
+
char incrementor;
|
113
|
+
|
114
|
+
// This is the terminator of the list literal.
|
115
|
+
char terminator;
|
116
|
+
|
117
|
+
// This is the character set that should be used to delimit the
|
118
|
+
// tokens within the list.
|
119
|
+
char breakpoints[11];
|
120
|
+
} list;
|
121
|
+
|
122
|
+
struct {
|
123
|
+
// This keeps track of the nesting level of the regular expression.
|
124
|
+
size_t nesting;
|
125
|
+
|
126
|
+
// When lexing a regular expression, it takes into account balancing
|
127
|
+
// the terminator if the terminator is one of (), [], {}, or <>.
|
128
|
+
char incrementor;
|
129
|
+
|
130
|
+
// This is the terminator of the regular expression.
|
131
|
+
char terminator;
|
132
|
+
|
133
|
+
// This is the character set that should be used to delimit the
|
134
|
+
// tokens within the regular expression.
|
135
|
+
char breakpoints[6];
|
136
|
+
} regexp;
|
137
|
+
|
138
|
+
struct {
|
139
|
+
// This keeps track of the nesting level of the string.
|
140
|
+
size_t nesting;
|
141
|
+
|
142
|
+
// Whether or not interpolation is allowed in this string.
|
143
|
+
bool interpolation;
|
144
|
+
|
145
|
+
// Whether or not at the end of the string we should allow a :,
|
146
|
+
// which would indicate this was a dynamic symbol instead of a
|
147
|
+
// string.
|
148
|
+
bool label_allowed;
|
149
|
+
|
150
|
+
// When lexing a string, it takes into account balancing the
|
151
|
+
// terminator if the terminator is one of (), [], {}, or <>.
|
152
|
+
char incrementor;
|
153
|
+
|
154
|
+
// This is the terminator of the string. It is typically either a
|
155
|
+
// single or double quote.
|
156
|
+
char terminator;
|
157
|
+
|
158
|
+
// This is the character set that should be used to delimit the
|
159
|
+
// tokens within the string.
|
160
|
+
char breakpoints[6];
|
161
|
+
} string;
|
162
|
+
|
163
|
+
struct {
|
164
|
+
// This is the pointer to the beginning of the heredoc identifier,
|
165
|
+
// followed by the identifier's length.
|
166
|
+
const char *ident_start;
|
167
|
+
size_t ident_length;
|
168
|
+
|
169
|
+
yp_heredoc_quote_t quote;
|
170
|
+
yp_heredoc_indent_t indent;
|
171
|
+
|
172
|
+
// This is the pointer to the character where lexing should resume
|
173
|
+
// once the heredoc has been completely processed.
|
174
|
+
const char *next_start;
|
175
|
+
} heredoc;
|
176
|
+
} as;
|
177
|
+
|
178
|
+
// The previous lex state so that it knows how to pop.
|
179
|
+
struct yp_lex_mode *prev;
|
180
|
+
} yp_lex_mode_t;
|
181
|
+
|
182
|
+
// We pre-allocate a certain number of lex states in order to avoid having to
|
183
|
+
// call malloc too many times while parsing. You really shouldn't need more than
|
184
|
+
// this because you only really nest deeply when doing string interpolation.
|
185
|
+
#define YP_LEX_STACK_SIZE 4
|
186
|
+
|
187
|
+
// A forward declaration since our error handler struct accepts a parser for
|
188
|
+
// each of its function calls.
|
189
|
+
typedef struct yp_parser yp_parser_t;
|
190
|
+
|
191
|
+
// While parsing, we keep track of a stack of contexts. This is helpful for
|
192
|
+
// error recovery so that we can pop back to a previous context when we hit a
|
193
|
+
// token that is understood by a parent context but not by the current context.
|
194
|
+
typedef enum {
|
195
|
+
YP_CONTEXT_BEGIN, // a begin statement
|
196
|
+
YP_CONTEXT_BLOCK_BRACES, // expressions in block arguments using braces
|
197
|
+
YP_CONTEXT_BLOCK_KEYWORDS, // expressions in block arguments using do..end
|
198
|
+
YP_CONTEXT_CASE_WHEN, // a case when statement
|
199
|
+
YP_CONTEXT_CASE_IN, // a case in statement
|
200
|
+
YP_CONTEXT_CLASS, // a class declaration
|
201
|
+
YP_CONTEXT_DEF, // a method definition
|
202
|
+
YP_CONTEXT_DEF_PARAMS, // a method definition's parameters
|
203
|
+
YP_CONTEXT_DEFAULT_PARAMS, // a method definition's default parameter
|
204
|
+
YP_CONTEXT_ELSE, // an else clause
|
205
|
+
YP_CONTEXT_ELSIF, // an elsif clause
|
206
|
+
YP_CONTEXT_EMBEXPR, // an interpolated expression
|
207
|
+
YP_CONTEXT_ENSURE, // an ensure statement
|
208
|
+
YP_CONTEXT_FOR, // a for loop
|
209
|
+
YP_CONTEXT_IF, // an if statement
|
210
|
+
YP_CONTEXT_LAMBDA_BRACES, // a lambda expression with braces
|
211
|
+
YP_CONTEXT_LAMBDA_DO_END, // a lambda expression with do..end
|
212
|
+
YP_CONTEXT_MAIN, // the top level context
|
213
|
+
YP_CONTEXT_MODULE, // a module declaration
|
214
|
+
YP_CONTEXT_PARENS, // a parenthesized expression
|
215
|
+
YP_CONTEXT_POSTEXE, // an END block
|
216
|
+
YP_CONTEXT_PREDICATE, // a predicate inside an if/elsif/unless statement
|
217
|
+
YP_CONTEXT_PREEXE, // a BEGIN block
|
218
|
+
YP_CONTEXT_RESCUE_ELSE, // a rescue else statement
|
219
|
+
YP_CONTEXT_RESCUE, // a rescue statement
|
220
|
+
YP_CONTEXT_SCLASS, // a singleton class definition
|
221
|
+
YP_CONTEXT_UNLESS, // an unless statement
|
222
|
+
YP_CONTEXT_UNTIL, // an until statement
|
223
|
+
YP_CONTEXT_WHILE, // a while statement
|
224
|
+
} yp_context_t;
|
225
|
+
|
226
|
+
// This is a node in a linked list of contexts.
|
227
|
+
typedef struct yp_context_node {
|
228
|
+
yp_context_t context;
|
229
|
+
struct yp_context_node *prev;
|
230
|
+
} yp_context_node_t;
|
231
|
+
|
232
|
+
// This is the type of a comment that we've found while parsing.
|
233
|
+
typedef enum {
|
234
|
+
YP_COMMENT_INLINE,
|
235
|
+
YP_COMMENT_EMBDOC,
|
236
|
+
YP_COMMENT___END__
|
237
|
+
} yp_comment_type_t;
|
238
|
+
|
239
|
+
// This is a node in the linked list of comments that we've found while parsing.
|
240
|
+
typedef struct yp_comment {
|
241
|
+
yp_list_node_t node;
|
242
|
+
const char *start;
|
243
|
+
const char *end;
|
244
|
+
yp_comment_type_t type;
|
245
|
+
} yp_comment_t;
|
246
|
+
|
247
|
+
// When the encoding that is being used to parse the source is changed by YARP,
|
248
|
+
// we provide the ability here to call out to a user-defined function.
|
249
|
+
typedef void (*yp_encoding_changed_callback_t)(yp_parser_t *parser);
|
250
|
+
|
251
|
+
// When an encoding is encountered that isn't understood by YARP, we provide
|
252
|
+
// the ability here to call out to a user-defined function to get an encoding
|
253
|
+
// struct. If the function returns something that isn't NULL, we set that to
|
254
|
+
// our encoding and use it to parse identifiers.
|
255
|
+
typedef yp_encoding_t *(*yp_encoding_decode_callback_t)(yp_parser_t *parser, const char *name, size_t width);
|
256
|
+
|
257
|
+
// When you are lexing through a file, the lexer needs all of the information
|
258
|
+
// that the parser additionally provides (for example, the local table). So if
|
259
|
+
// you want to properly lex Ruby, you need to actually lex it in the context of
|
260
|
+
// the parser. In order to provide this functionality, we optionally allow a
|
261
|
+
// struct to be attached to the parser that calls back out to a user-provided
|
262
|
+
// callback when each token is lexed.
|
263
|
+
typedef struct {
|
264
|
+
// This opaque pointer is used to provide whatever information the user
|
265
|
+
// deemed necessary to the callback. In our case we use it to pass the array
|
266
|
+
// that the tokens get appended into.
|
267
|
+
void *data;
|
268
|
+
|
269
|
+
// This is the callback that is called when a token is lexed. It is passed
|
270
|
+
// the opaque data pointer, the parser, and the token that was lexed.
|
271
|
+
void (*callback)(void *data, yp_parser_t *parser, yp_token_t *token);
|
272
|
+
} yp_lex_callback_t;
|
273
|
+
|
274
|
+
// This struct represents a node in a linked list of scopes. Some scopes can see
|
275
|
+
// into their parent scopes, while others cannot.
|
276
|
+
typedef struct yp_scope {
|
277
|
+
// The IDs of the locals in the given scope.
|
278
|
+
yp_constant_id_list_t locals;
|
279
|
+
|
280
|
+
// A boolean indicating whether or not this scope can see into its parent.
|
281
|
+
// If closed is true, then the scope cannot see into its parent.
|
282
|
+
bool closed;
|
283
|
+
|
284
|
+
// A pointer to the previous scope in the linked list.
|
285
|
+
struct yp_scope *previous;
|
286
|
+
} yp_scope_t;
|
287
|
+
|
288
|
+
// This struct represents the overall parser. It contains a reference to the
|
289
|
+
// source file, as well as pointers that indicate where in the source it's
|
290
|
+
// currently parsing. It also contains the most recent and current token that
|
291
|
+
// it's considering.
|
292
|
+
struct yp_parser {
|
293
|
+
yp_lex_state_t lex_state; // the current state of the lexer
|
294
|
+
bool command_start; // whether or not we're at the beginning of a command
|
295
|
+
int enclosure_nesting; // tracks the current nesting of (), [], and {}
|
296
|
+
|
297
|
+
// Used to temporarily track the nesting of enclosures to determine if a {
|
298
|
+
// is the beginning of a lambda following the parameters of a lambda.
|
299
|
+
int lambda_enclosure_nesting;
|
300
|
+
|
301
|
+
// Used to track the nesting of braces to ensure we get the correct value
|
302
|
+
// when we are interpolating blocks with braces.
|
303
|
+
int brace_nesting;
|
304
|
+
|
305
|
+
// the stack used to determine if a do keyword belongs to the predicate of a
|
306
|
+
// while, until, or for loop
|
307
|
+
yp_state_stack_t do_loop_stack;
|
308
|
+
|
309
|
+
// the stack used to determine if a do keyword belongs to the beginning of a
|
310
|
+
// block
|
311
|
+
yp_state_stack_t accepts_block_stack;
|
312
|
+
|
313
|
+
struct {
|
314
|
+
yp_lex_mode_t *current; // the current mode of the lexer
|
315
|
+
yp_lex_mode_t stack[YP_LEX_STACK_SIZE]; // the stack of lexer modes
|
316
|
+
size_t index; // the current index into the lexer mode stack
|
317
|
+
} lex_modes;
|
318
|
+
|
319
|
+
const char *start; // the pointer to the start of the source
|
320
|
+
const char *end; // the pointer to the end of the source
|
321
|
+
yp_token_t previous; // the previous token we were considering
|
322
|
+
yp_token_t current; // the current token we're considering
|
323
|
+
|
324
|
+
// This is a special field set on the parser when we need the parser to jump
|
325
|
+
// to a specific location when lexing the next token, as opposed to just
|
326
|
+
// using the end of the previous token. Normally this is NULL.
|
327
|
+
const char *next_start;
|
328
|
+
|
329
|
+
// This field indicates the end of a heredoc whose identifier was found on
|
330
|
+
// the current line. If another heredoc is found on the same line, then this
|
331
|
+
// will be moved forward to the end of that heredoc. If no heredocs are
|
332
|
+
// found on a line then this is NULL.
|
333
|
+
const char *heredoc_end;
|
334
|
+
|
335
|
+
yp_list_t comment_list; // the list of comments that have been found while parsing
|
336
|
+
yp_list_t warning_list; // the list of warnings that have been found while parsing
|
337
|
+
yp_list_t error_list; // the list of errors that have been found while parsing
|
338
|
+
yp_scope_t *current_scope; // the current local scope
|
339
|
+
|
340
|
+
yp_context_node_t *current_context; // the current parsing context
|
341
|
+
bool recovering; // whether or not we're currently recovering from a syntax error
|
342
|
+
|
343
|
+
// The encoding functions for the current file are attached to the parser as
|
344
|
+
// it's parsing so that it can change with a magic comment.
|
345
|
+
yp_encoding_t encoding;
|
346
|
+
|
347
|
+
// Whether or not the encoding has been changed by a magic comment. We use
|
348
|
+
// this to provide a fast path for the lexer instead of going through the
|
349
|
+
// function pointer.
|
350
|
+
bool encoding_changed;
|
351
|
+
|
352
|
+
// When the encoding that is being used to parse the source is changed by
|
353
|
+
// YARP, we provide the ability here to call out to a user-defined function.
|
354
|
+
yp_encoding_changed_callback_t encoding_changed_callback;
|
355
|
+
|
356
|
+
// When an encoding is encountered that isn't understood by YARP, we provide
|
357
|
+
// the ability here to call out to a user-defined function to get an
|
358
|
+
// encoding struct. If the function returns something that isn't NULL, we
|
359
|
+
// set that to our encoding and use it to parse identifiers.
|
360
|
+
yp_encoding_decode_callback_t encoding_decode_callback;
|
361
|
+
|
362
|
+
// This pointer indicates where a comment must start if it is to be
|
363
|
+
// considered an encoding comment.
|
364
|
+
const char *encoding_comment_start;
|
365
|
+
|
366
|
+
// This is an optional callback that can be attached to the parser that will
|
367
|
+
// be called whenever a new token is lexed by the parser.
|
368
|
+
yp_lex_callback_t *lex_callback;
|
369
|
+
|
370
|
+
// This flag indicates that we are currently parsing a pattern matching
|
371
|
+
// expression and impacts the calculation of newlines.
|
372
|
+
bool pattern_matching_newlines;
|
373
|
+
|
374
|
+
// This flag indicates that we are currently parsing a keyword argument.
|
375
|
+
bool in_keyword_arg;
|
376
|
+
|
377
|
+
// This is the path of the file being parsed.
|
378
|
+
// We use the filepath when constructing SourceFileNodes.
|
379
|
+
yp_string_t filepath_string;
|
380
|
+
|
381
|
+
// This constant pool keeps all of the constants defined throughout the file
|
382
|
+
// so that we can reference them later.
|
383
|
+
yp_constant_pool_t constant_pool;
|
384
|
+
|
385
|
+
// This is the list of newline offsets in the source file.
|
386
|
+
yp_newline_list_t newline_list;
|
387
|
+
};
|
388
|
+
|
389
|
+
#endif // YARP_PARSER_H
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#ifndef YARP_REGEXP_H
|
2
|
+
#define YARP_REGEXP_H
|
3
|
+
|
4
|
+
#include "yarp/defines.h"
|
5
|
+
#include "yarp/parser.h"
|
6
|
+
#include "yarp/enc/yp_encoding.h"
|
7
|
+
#include "yarp/util/yp_memchr.h"
|
8
|
+
#include "yarp/util/yp_string_list.h"
|
9
|
+
#include "yarp/util/yp_string.h"
|
10
|
+
|
11
|
+
#include <stdbool.h>
|
12
|
+
#include <stddef.h>
|
13
|
+
#include <string.h>
|
14
|
+
|
15
|
+
// Parse a regular expression and extract the names of all of the named capture
|
16
|
+
// groups.
|
17
|
+
YP_EXPORTED_FUNCTION bool yp_regexp_named_capture_group_names(const char *source, size_t size, yp_string_list_t *named_captures, bool encoding_changed, yp_encoding_t *encoding);
|
18
|
+
|
19
|
+
#endif
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#ifndef YARP_UNESCAPE_H
|
2
|
+
#define YARP_UNESCAPE_H
|
3
|
+
|
4
|
+
#include "yarp/defines.h"
|
5
|
+
#include "yarp/diagnostic.h"
|
6
|
+
#include "yarp/parser.h"
|
7
|
+
#include "yarp/util/yp_char.h"
|
8
|
+
#include "yarp/util/yp_list.h"
|
9
|
+
#include "yarp/util/yp_memchr.h"
|
10
|
+
#include "yarp/util/yp_string.h"
|
11
|
+
|
12
|
+
#include <assert.h>
|
13
|
+
#include <stdbool.h>
|
14
|
+
#include <stdint.h>
|
15
|
+
#include <string.h>
|
16
|
+
|
17
|
+
// The type of unescape we are performing.
|
18
|
+
typedef enum {
|
19
|
+
// When we're creating a string inside of a list literal like %w, we
|
20
|
+
// shouldn't unescape anything.
|
21
|
+
YP_UNESCAPE_NONE,
|
22
|
+
|
23
|
+
// When we're unescaping a single-quoted string, we only need to unescape
|
24
|
+
// single quotes and backslashes.
|
25
|
+
YP_UNESCAPE_MINIMAL,
|
26
|
+
|
27
|
+
// When we're unescaping a double-quoted string, we need to unescape all
|
28
|
+
// escapes.
|
29
|
+
YP_UNESCAPE_ALL
|
30
|
+
} yp_unescape_type_t;
|
31
|
+
|
32
|
+
// Unescape the contents of the given token into the given string using the
|
33
|
+
// given unescape mode.
|
34
|
+
YP_EXPORTED_FUNCTION void yp_unescape_manipulate_string(yp_parser_t *parser, const char *value, size_t length, yp_string_t *string, yp_unescape_type_t unescape_type, yp_list_t *error_list);
|
35
|
+
|
36
|
+
// Accepts a source string and a type of unescaping and returns the unescaped version.
|
37
|
+
// The caller must yp_string_free(result); after calling this function.
|
38
|
+
YP_EXPORTED_FUNCTION bool yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result);
|
39
|
+
|
40
|
+
// NOTE(review): this is the only exported function in this header without a
// description. Judging by its name and signature it presumably reports a size
// difference produced by unescaping the range [value, end) with the given
// unescape mode, recording any problems in error_list - confirm against
// unescape.c and document the contract (including expect_single_codepoint).
YP_EXPORTED_FUNCTION size_t yp_unescape_calculate_difference(const char *value, const char *end, yp_unescape_type_t unescape_type, bool expect_single_codepoint, yp_list_t *error_list);
|
41
|
+
|
42
|
+
#endif
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#ifndef YARP_BUFFER_H
|
2
|
+
#define YARP_BUFFER_H
|
3
|
+
|
4
|
+
#include "yarp/defines.h"
|
5
|
+
|
6
|
+
#include <assert.h>
|
7
|
+
#include <stdbool.h>
|
8
|
+
#include <stdint.h>
|
9
|
+
#include <stdlib.h>
|
10
|
+
#include <string.h>
|
11
|
+
|
12
|
+
// A yp_buffer_t is a simple memory buffer that stores data in a contiguous
|
13
|
+
// block of memory. It is used to store the serialized representation of a
|
14
|
+
// YARP tree.
|
15
|
+
typedef struct {
|
16
|
+
char *value;
|
17
|
+
size_t length;
|
18
|
+
size_t capacity;
|
19
|
+
} yp_buffer_t;
|
20
|
+
|
21
|
+
// Initialize a yp_buffer_t with its default values.
|
22
|
+
YP_EXPORTED_FUNCTION bool yp_buffer_init(yp_buffer_t *buffer);
|
23
|
+
|
24
|
+
// Append the given amount of space as zeroes to the buffer.
|
25
|
+
void yp_buffer_append_zeroes(yp_buffer_t *buffer, size_t length);
|
26
|
+
|
27
|
+
// Append a string to the buffer.
|
28
|
+
void yp_buffer_append_str(yp_buffer_t *buffer, const char *value, size_t length);
|
29
|
+
|
30
|
+
// Append a single byte to the buffer.
|
31
|
+
void yp_buffer_append_u8(yp_buffer_t *buffer, uint8_t value);
|
32
|
+
|
33
|
+
// Append a 32-bit unsigned integer to the buffer.
|
34
|
+
void yp_buffer_append_u32(yp_buffer_t *buffer, uint32_t value);
|
35
|
+
|
36
|
+
// Free the memory associated with the buffer.
|
37
|
+
YP_EXPORTED_FUNCTION void yp_buffer_free(yp_buffer_t *buffer);
|
38
|
+
|
39
|
+
#endif
|
@@ -0,0 +1,75 @@
|
|
1
|
+
#ifndef YP_CHAR_H
|
2
|
+
#define YP_CHAR_H
|
3
|
+
|
4
|
+
#include "yarp/defines.h"
|
5
|
+
#include "yarp/util/yp_newline_list.h"
|
6
|
+
|
7
|
+
#include <stdbool.h>
|
8
|
+
#include <stddef.h>
|
9
|
+
|
10
|
+
// Returns the number of characters at the start of the string that are
|
11
|
+
// whitespace. Disallows searching past the given maximum number of characters.
|
12
|
+
size_t yp_strspn_whitespace(const char *string, ptrdiff_t length);
|
13
|
+
|
14
|
+
// Returns the number of characters at the start of the string that are
|
15
|
+
// whitespace while also tracking the location of each newline. Disallows
|
16
|
+
// searching past the given maximum number of characters.
// NOTE(review): the trailing flag was unnamed in this prototype; it is named
// stop_at_newline here on the assumption that it makes the scan stop once a
// newline has been consumed - confirm against the definition in yp_char.c.
|
17
|
+
size_t
|
18
|
+
yp_strspn_whitespace_newlines(const char *string, ptrdiff_t length, yp_newline_list_t *newline_list, bool stop_at_newline);
|
19
|
+
|
20
|
+
// Returns the number of characters at the start of the string that are inline
|
21
|
+
// whitespace. Disallows searching past the given maximum number of characters.
|
22
|
+
size_t yp_strspn_inline_whitespace(const char *string, ptrdiff_t length);
|
23
|
+
|
24
|
+
// Returns the number of characters at the start of the string that are decimal
|
25
|
+
// digits. Disallows searching past the given maximum number of characters.
|
26
|
+
size_t yp_strspn_decimal_digit(const char *string, ptrdiff_t length);
|
27
|
+
|
28
|
+
// Returns the number of characters at the start of the string that are
|
29
|
+
// hexadecimal digits. Disallows searching past the given maximum number of
|
30
|
+
// characters.
|
31
|
+
size_t yp_strspn_hexadecimal_digit(const char *string, ptrdiff_t length);
|
32
|
+
|
33
|
+
// Returns the number of characters at the start of the string that are octal
|
34
|
+
// digits or underscores. Disallows searching past the given maximum number of
|
35
|
+
// characters.
|
36
|
+
size_t yp_strspn_octal_number(const char *string, ptrdiff_t length);
|
37
|
+
|
38
|
+
// Returns the number of characters at the start of the string that are decimal
|
39
|
+
// digits or underscores. Disallows searching past the given maximum number of
|
40
|
+
// characters.
|
41
|
+
size_t yp_strspn_decimal_number(const char *string, ptrdiff_t length);
|
42
|
+
|
43
|
+
// Returns the number of characters at the start of the string that are
|
44
|
+
// hexadecimal digits or underscores. Disallows searching past the given maximum
|
45
|
+
// number of characters.
|
46
|
+
size_t yp_strspn_hexadecimal_number(const char *string, ptrdiff_t length);
|
47
|
+
|
48
|
+
// Returns the number of characters at the start of the string that are regexp
|
49
|
+
// options. Disallows searching past the given maximum number of characters.
|
50
|
+
size_t yp_strspn_regexp_option(const char *string, ptrdiff_t length);
|
51
|
+
|
52
|
+
// Returns the number of characters at the start of the string that are binary
|
53
|
+
// digits or underscores. Disallows searching past the given maximum number of
|
54
|
+
// characters.
|
55
|
+
size_t yp_strspn_binary_number(const char *string, ptrdiff_t length);
|
56
|
+
|
57
|
+
// Returns true if the given character is a whitespace character.
|
58
|
+
bool yp_char_is_whitespace(const char c);
|
59
|
+
|
60
|
+
// Returns true if the given character is an inline whitespace character.
|
61
|
+
bool yp_char_is_inline_whitespace(const char c);
|
62
|
+
|
63
|
+
// Returns true if the given character is a binary digit.
|
64
|
+
bool yp_char_is_binary_digit(const char c);
|
65
|
+
|
66
|
+
// Returns true if the given character is an octal digit.
|
67
|
+
bool yp_char_is_octal_digit(const char c);
|
68
|
+
|
69
|
+
// Returns true if the given character is a decimal digit.
|
70
|
+
bool yp_char_is_decimal_digit(const char c);
|
71
|
+
|
72
|
+
// Returns true if the given character is a hexadecimal digit.
|
73
|
+
bool yp_char_is_hexadecimal_digit(const char c);
|
74
|
+
|
75
|
+
#endif
|
@@ -0,0 +1,64 @@
|
|
1
|
+
// The constant pool is a data structure that stores a set of strings. Each
|
2
|
+
// string is assigned a unique id, which can be used to compare strings for
|
3
|
+
// equality. This comparison ends up being much faster than strcmp, since it
|
4
|
+
// only requires a single integer comparison.
|
5
|
+
|
6
|
+
#ifndef YP_CONSTANT_POOL_H
|
7
|
+
#define YP_CONSTANT_POOL_H
|
8
|
+
|
9
|
+
#include "yarp/defines.h"
|
10
|
+
|
11
|
+
#include <stdbool.h>
|
12
|
+
#include <stdint.h>
|
13
|
+
#include <stdlib.h>
|
14
|
+
#include <string.h>
|
15
|
+
|
16
|
+
typedef uint32_t yp_constant_id_t;
|
17
|
+
|
18
|
+
typedef struct {
|
19
|
+
yp_constant_id_t *ids;
|
20
|
+
size_t size;
|
21
|
+
size_t capacity;
|
22
|
+
} yp_constant_id_list_t;
|
23
|
+
|
24
|
+
// Initialize a list of constant ids.
|
25
|
+
void yp_constant_id_list_init(yp_constant_id_list_t *list);
|
26
|
+
|
27
|
+
// Append a constant id to a list of constant ids. Returns false if any
|
28
|
+
// potential reallocations fail.
|
29
|
+
bool yp_constant_id_list_append(yp_constant_id_list_t *list, yp_constant_id_t id);
|
30
|
+
|
31
|
+
// Checks if the current constant id list includes the given constant id.
|
32
|
+
bool
|
33
|
+
yp_constant_id_list_includes(yp_constant_id_list_t *list, yp_constant_id_t id);
|
34
|
+
|
35
|
+
// Get the memory size of a list of constant ids.
|
36
|
+
size_t yp_constant_id_list_memsize(yp_constant_id_list_t *list);
|
37
|
+
|
38
|
+
// Free the memory associated with a list of constant ids.
|
39
|
+
void yp_constant_id_list_free(yp_constant_id_list_t *list);
|
40
|
+
|
41
|
+
typedef struct {
|
42
|
+
yp_constant_id_t id;
|
43
|
+
const char *start;
|
44
|
+
size_t length;
|
45
|
+
size_t hash;
|
46
|
+
} yp_constant_t;
|
47
|
+
|
48
|
+
typedef struct {
|
49
|
+
yp_constant_t *constants;
|
50
|
+
size_t size;
|
51
|
+
size_t capacity;
|
52
|
+
} yp_constant_pool_t;
|
53
|
+
|
54
|
+
// Initialize a new constant pool with a given capacity.
|
55
|
+
bool yp_constant_pool_init(yp_constant_pool_t *pool, size_t capacity);
|
56
|
+
|
57
|
+
// Insert a constant into a constant pool. Returns the id of the constant, or 0
|
58
|
+
// if any potential calls to resize fail.
|
59
|
+
yp_constant_id_t yp_constant_pool_insert(yp_constant_pool_t *pool, const char *start, size_t length);
|
60
|
+
|
61
|
+
// Free the memory associated with a constant pool.
|
62
|
+
void yp_constant_pool_free(yp_constant_pool_t *pool);
|
63
|
+
|
64
|
+
#endif
|