prism 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (95) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +172 -0
  3. data/CODE_OF_CONDUCT.md +76 -0
  4. data/CONTRIBUTING.md +62 -0
  5. data/LICENSE.md +7 -0
  6. data/Makefile +84 -0
  7. data/README.md +89 -0
  8. data/config.yml +2481 -0
  9. data/docs/build_system.md +74 -0
  10. data/docs/building.md +22 -0
  11. data/docs/configuration.md +60 -0
  12. data/docs/design.md +53 -0
  13. data/docs/encoding.md +117 -0
  14. data/docs/fuzzing.md +93 -0
  15. data/docs/heredocs.md +36 -0
  16. data/docs/mapping.md +117 -0
  17. data/docs/ripper.md +36 -0
  18. data/docs/ruby_api.md +25 -0
  19. data/docs/serialization.md +181 -0
  20. data/docs/testing.md +55 -0
  21. data/ext/prism/api_node.c +4725 -0
  22. data/ext/prism/api_pack.c +256 -0
  23. data/ext/prism/extconf.rb +136 -0
  24. data/ext/prism/extension.c +626 -0
  25. data/ext/prism/extension.h +18 -0
  26. data/include/prism/ast.h +1932 -0
  27. data/include/prism/defines.h +45 -0
  28. data/include/prism/diagnostic.h +231 -0
  29. data/include/prism/enc/pm_encoding.h +95 -0
  30. data/include/prism/node.h +41 -0
  31. data/include/prism/pack.h +141 -0
  32. data/include/prism/parser.h +418 -0
  33. data/include/prism/regexp.h +19 -0
  34. data/include/prism/unescape.h +48 -0
  35. data/include/prism/util/pm_buffer.h +51 -0
  36. data/include/prism/util/pm_char.h +91 -0
  37. data/include/prism/util/pm_constant_pool.h +78 -0
  38. data/include/prism/util/pm_list.h +67 -0
  39. data/include/prism/util/pm_memchr.h +14 -0
  40. data/include/prism/util/pm_newline_list.h +61 -0
  41. data/include/prism/util/pm_state_stack.h +24 -0
  42. data/include/prism/util/pm_string.h +61 -0
  43. data/include/prism/util/pm_string_list.h +25 -0
  44. data/include/prism/util/pm_strpbrk.h +29 -0
  45. data/include/prism/version.h +4 -0
  46. data/include/prism.h +82 -0
  47. data/lib/prism/compiler.rb +465 -0
  48. data/lib/prism/debug.rb +157 -0
  49. data/lib/prism/desugar_compiler.rb +206 -0
  50. data/lib/prism/dispatcher.rb +2051 -0
  51. data/lib/prism/dsl.rb +750 -0
  52. data/lib/prism/ffi.rb +251 -0
  53. data/lib/prism/lex_compat.rb +838 -0
  54. data/lib/prism/mutation_compiler.rb +718 -0
  55. data/lib/prism/node.rb +14540 -0
  56. data/lib/prism/node_ext.rb +55 -0
  57. data/lib/prism/node_inspector.rb +68 -0
  58. data/lib/prism/pack.rb +185 -0
  59. data/lib/prism/parse_result/comments.rb +172 -0
  60. data/lib/prism/parse_result/newlines.rb +60 -0
  61. data/lib/prism/parse_result.rb +266 -0
  62. data/lib/prism/pattern.rb +239 -0
  63. data/lib/prism/ripper_compat.rb +174 -0
  64. data/lib/prism/serialize.rb +662 -0
  65. data/lib/prism/visitor.rb +470 -0
  66. data/lib/prism.rb +64 -0
  67. data/prism.gemspec +113 -0
  68. data/src/diagnostic.c +287 -0
  69. data/src/enc/pm_big5.c +52 -0
  70. data/src/enc/pm_euc_jp.c +58 -0
  71. data/src/enc/pm_gbk.c +61 -0
  72. data/src/enc/pm_shift_jis.c +56 -0
  73. data/src/enc/pm_tables.c +507 -0
  74. data/src/enc/pm_unicode.c +2324 -0
  75. data/src/enc/pm_windows_31j.c +56 -0
  76. data/src/node.c +2633 -0
  77. data/src/pack.c +493 -0
  78. data/src/prettyprint.c +2136 -0
  79. data/src/prism.c +14587 -0
  80. data/src/regexp.c +580 -0
  81. data/src/serialize.c +1899 -0
  82. data/src/token_type.c +349 -0
  83. data/src/unescape.c +637 -0
  84. data/src/util/pm_buffer.c +103 -0
  85. data/src/util/pm_char.c +272 -0
  86. data/src/util/pm_constant_pool.c +252 -0
  87. data/src/util/pm_list.c +41 -0
  88. data/src/util/pm_memchr.c +33 -0
  89. data/src/util/pm_newline_list.c +134 -0
  90. data/src/util/pm_state_stack.c +19 -0
  91. data/src/util/pm_string.c +200 -0
  92. data/src/util/pm_string_list.c +29 -0
  93. data/src/util/pm_strncasecmp.c +17 -0
  94. data/src/util/pm_strpbrk.c +66 -0
  95. metadata +138 -0
@@ -0,0 +1,418 @@
1
+ #ifndef PRISM_PARSER_H
2
+ #define PRISM_PARSER_H
3
+
4
+ #include "prism/ast.h"
5
+ #include "prism/defines.h"
6
+ #include "prism/enc/pm_encoding.h"
7
+ #include "prism/util/pm_constant_pool.h"
8
+ #include "prism/util/pm_list.h"
9
+ #include "prism/util/pm_newline_list.h"
10
+ #include "prism/util/pm_state_stack.h"
11
+
12
+ #include <stdbool.h>
13
+
14
+ // This enum provides various bits that represent different kinds of states that
15
+ // the lexer can track. This is used to determine which kind of token to return
16
+ // based on the context of the parser.
17
+ typedef enum {
18
+ PM_LEX_STATE_BIT_BEG,
19
+ PM_LEX_STATE_BIT_END,
20
+ PM_LEX_STATE_BIT_ENDARG,
21
+ PM_LEX_STATE_BIT_ENDFN,
22
+ PM_LEX_STATE_BIT_ARG,
23
+ PM_LEX_STATE_BIT_CMDARG,
24
+ PM_LEX_STATE_BIT_MID,
25
+ PM_LEX_STATE_BIT_FNAME,
26
+ PM_LEX_STATE_BIT_DOT,
27
+ PM_LEX_STATE_BIT_CLASS,
28
+ PM_LEX_STATE_BIT_LABEL,
29
+ PM_LEX_STATE_BIT_LABELED,
30
+ PM_LEX_STATE_BIT_FITEM
31
+ } pm_lex_state_bit_t;
32
+
33
+ // This enum combines the various bits from the above enum into individual
34
+ // values that represent the various states of the lexer.
35
+ typedef enum {
36
+ PM_LEX_STATE_NONE = 0,
37
+ PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG),
38
+ PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END),
39
+ PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG),
40
+ PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN),
41
+ PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG),
42
+ PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG),
43
+ PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID),
44
+ PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME),
45
+ PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT),
46
+ PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS),
47
+ PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL),
48
+ PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED),
49
+ PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM),
50
+ PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS,
51
+ PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG,
52
+ PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN
53
+ } pm_lex_state_t;
54
+
55
+ typedef enum { // The kind of quote attached to a heredoc; stored in the quote field of the heredoc lex mode.
56
+ PM_HEREDOC_QUOTE_NONE, // no quote; implicitly the value 0
57
+ PM_HEREDOC_QUOTE_SINGLE = '\'', // the value is the single quote character itself
58
+ PM_HEREDOC_QUOTE_DOUBLE = '"', // the value is the double quote character itself
59
+ PM_HEREDOC_QUOTE_BACKTICK = '`', // the value is the backtick character itself
60
+ } pm_heredoc_quote_t;
61
+
62
+ typedef enum { // The indentation kind of a heredoc; stored in the indent field of the heredoc lex mode.
63
+ PM_HEREDOC_INDENT_NONE, // implicitly the value 0
64
+ PM_HEREDOC_INDENT_DASH, // presumably the <<- heredoc form -- TODO confirm against the lexer
65
+ PM_HEREDOC_INDENT_TILDE, // presumably the <<~ heredoc form -- TODO confirm against the lexer
66
+ } pm_heredoc_indent_t;
67
+
68
+ // When lexing Ruby source, the lexer has a small amount of state to tell which
69
+ // kind of token it is currently lexing. For example, when we find the start of
70
+ // a string, the first token that we return is a TOKEN_STRING_BEGIN token. After
71
+ // that the lexer is now in the PM_LEX_STRING mode, and will return tokens that
72
+ // are found as part of a string.
73
+ typedef struct pm_lex_mode {
74
+ enum {
75
+ // This state is used when any given token is being lexed.
76
+ PM_LEX_DEFAULT,
77
+
78
+ // This state is used when we're lexing as normal but inside an embedded
79
+ // expression of a string.
80
+ PM_LEX_EMBEXPR,
81
+
82
+ // This state is used when we're lexing a variable that is embedded
83
+ // directly inside of a string with the # shorthand.
84
+ PM_LEX_EMBVAR,
85
+
86
+ // This state is used when you are inside the content of a heredoc.
87
+ PM_LEX_HEREDOC,
88
+
89
+ // This state is used when we are lexing a list of tokens, as in a %w
90
+ // word list literal or a %i symbol list literal.
91
+ PM_LEX_LIST,
92
+
93
+ // This state is used when a regular expression has been begun and we
94
+ // are looking for the terminator.
95
+ PM_LEX_REGEXP,
96
+
97
+ // This state is used when we are lexing a string or a string-like
98
+ // token, as in string content with either quote or an xstring.
99
+ PM_LEX_STRING
100
+ } mode;
101
+
102
+ union {
103
+ struct {
104
+ // This keeps track of the nesting level of the list.
105
+ size_t nesting;
106
+
107
+ // Whether or not interpolation is allowed in this list.
108
+ bool interpolation;
109
+
110
+ // When lexing a list, it takes into account balancing the
111
+ // terminator if the terminator is one of (), [], {}, or <>.
112
+ uint8_t incrementor;
113
+
114
+ // This is the terminator of the list literal.
115
+ uint8_t terminator;
116
+
117
+ // This is the character set that should be used to delimit the
118
+ // tokens within the list.
119
+ uint8_t breakpoints[11];
120
+ } list;
121
+
122
+ struct {
123
+ // This keeps track of the nesting level of the regular expression.
124
+ size_t nesting;
125
+
126
+ // When lexing a regular expression, it takes into account balancing
127
+ // the terminator if the terminator is one of (), [], {}, or <>.
128
+ uint8_t incrementor;
129
+
130
+ // This is the terminator of the regular expression.
131
+ uint8_t terminator;
132
+
133
+ // This is the character set that should be used to delimit the
134
+ // tokens within the regular expression.
135
+ uint8_t breakpoints[6];
136
+ } regexp;
137
+
138
+ struct {
139
+ // This keeps track of the nesting level of the string.
140
+ size_t nesting;
141
+
142
+ // Whether or not interpolation is allowed in this string.
143
+ bool interpolation;
144
+
145
+ // Whether or not at the end of the string we should allow a :,
146
+ // which would indicate this was a dynamic symbol instead of a
147
+ // string.
148
+ bool label_allowed;
149
+
150
+ // When lexing a string, it takes into account balancing the
151
+ // terminator if the terminator is one of (), [], {}, or <>.
152
+ uint8_t incrementor;
153
+
154
+ // This is the terminator of the string. It is typically either a
155
+ // single or double quote.
156
+ uint8_t terminator;
157
+
158
+ // This is the character set that should be used to delimit the
159
+ // tokens within the string.
160
+ uint8_t breakpoints[6];
161
+ } string;
162
+
163
+ struct {
164
+ // This is a pointer to the beginning of the heredoc identifier,
165
+ // followed by the length of that identifier.
166
+ const uint8_t *ident_start;
167
+ size_t ident_length;
168
+
169
+ pm_heredoc_quote_t quote;
170
+ pm_heredoc_indent_t indent;
171
+
172
+ // This is the pointer to the character where lexing should resume
173
+ // once the heredoc has been completely processed.
174
+ const uint8_t *next_start;
175
+ } heredoc;
176
+ } as;
177
+
178
+ // The previous lex state so that it knows how to pop.
179
+ struct pm_lex_mode *prev;
180
+ } pm_lex_mode_t;
181
+
182
+ // We pre-allocate a certain number of lex states in order to avoid having to
183
+ // call malloc too many times while parsing. You really shouldn't need more than
184
+ // this because you only really nest deeply when doing string interpolation.
185
+ #define PM_LEX_STACK_SIZE 4
186
+
187
+ // A forward declaration since our error handler struct accepts a parser for
188
+ // each of its function calls.
189
+ typedef struct pm_parser pm_parser_t;
190
+
191
+ // While parsing, we keep track of a stack of contexts. This is helpful for
192
+ // error recovery so that we can pop back to a previous context when we hit a
193
+ // token that is understood by a parent context but not by the current context.
194
+ typedef enum {
195
+ PM_CONTEXT_BEGIN, // a begin statement
196
+ PM_CONTEXT_BLOCK_BRACES, // expressions in block arguments using braces
197
+ PM_CONTEXT_BLOCK_KEYWORDS, // expressions in block arguments using do..end
198
+ PM_CONTEXT_CASE_WHEN, // a case when statement
199
+ PM_CONTEXT_CASE_IN, // a case in statement
200
+ PM_CONTEXT_CLASS, // a class declaration
201
+ PM_CONTEXT_DEF, // a method definition
202
+ PM_CONTEXT_DEF_PARAMS, // a method definition's parameters
203
+ PM_CONTEXT_DEFAULT_PARAMS, // a method definition's default parameter
204
+ PM_CONTEXT_ELSE, // an else clause
205
+ PM_CONTEXT_ELSIF, // an elsif clause
206
+ PM_CONTEXT_EMBEXPR, // an interpolated expression
207
+ PM_CONTEXT_ENSURE, // an ensure statement
208
+ PM_CONTEXT_FOR, // a for loop
209
+ PM_CONTEXT_IF, // an if statement
210
+ PM_CONTEXT_LAMBDA_BRACES, // a lambda expression with braces
211
+ PM_CONTEXT_LAMBDA_DO_END, // a lambda expression with do..end
212
+ PM_CONTEXT_MAIN, // the top level context
213
+ PM_CONTEXT_MODULE, // a module declaration
214
+ PM_CONTEXT_PARENS, // a parenthesized expression
215
+ PM_CONTEXT_POSTEXE, // an END block
216
+ PM_CONTEXT_PREDICATE, // a predicate inside an if/elsif/unless statement
217
+ PM_CONTEXT_PREEXE, // a BEGIN block
218
+ PM_CONTEXT_RESCUE_ELSE, // a rescue else statement
219
+ PM_CONTEXT_RESCUE, // a rescue statement
220
+ PM_CONTEXT_SCLASS, // a singleton class definition
221
+ PM_CONTEXT_UNLESS, // an unless statement
222
+ PM_CONTEXT_UNTIL, // an until statement
223
+ PM_CONTEXT_WHILE, // a while statement
224
+ } pm_context_t;
225
+
226
+ // This is a node in a linked list of contexts.
227
+ typedef struct pm_context_node {
228
+ pm_context_t context;
229
+ struct pm_context_node *prev;
230
+ } pm_context_node_t;
231
+
232
+ // This is the type of a comment that we've found while parsing.
233
+ typedef enum {
234
+ PM_COMMENT_INLINE, // presumably a comment introduced by # -- TODO confirm against the lexer
235
+ PM_COMMENT_EMBDOC, // presumably an =begin/=end embedded document -- TODO confirm against the lexer
236
+ PM_COMMENT___END__ // presumably the content following an __END__ marker -- TODO confirm against the lexer
237
+ } pm_comment_type_t;
238
+
239
+ // This is a node in the linked list of comments that we've found while parsing.
240
+ typedef struct pm_comment {
241
+ pm_list_node_t node;
242
+ const uint8_t *start;
243
+ const uint8_t *end;
244
+ pm_comment_type_t type;
245
+ } pm_comment_t;
246
+
247
+ // When the encoding that is being used to parse the source is changed by prism,
248
+ // we provide the ability here to call out to a user-defined function.
249
+ typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser);
250
+
251
+ // When an encoding is encountered that isn't understood by prism, we provide
252
+ // the ability here to call out to a user-defined function to get an encoding
253
+ // struct. If the function returns something that isn't NULL, we set that to
254
+ // our encoding and use it to parse identifiers.
255
+ typedef pm_encoding_t *(*pm_encoding_decode_callback_t)(pm_parser_t *parser, const uint8_t *name, size_t width);
256
+
257
+ // When you are lexing through a file, the lexer needs all of the information
258
+ // that the parser additionally provides (for example, the local table). So if
259
+ // you want to properly lex Ruby, you need to actually lex it in the context of
260
+ // the parser. In order to provide this functionality, we optionally allow a
261
+ // struct to be attached to the parser that calls back out to a user-provided
262
+ // callback when each token is lexed.
263
+ typedef struct {
264
+ // This opaque pointer is used to provide whatever information the user
265
+ // deemed necessary to the callback. In our case we use it to pass the array
266
+ // that the tokens get appended into.
267
+ void *data;
268
+
269
+ // This is the callback that is called when a token is lexed. It is passed
270
+ // the opaque data pointer, the parser, and the token that was lexed.
271
+ void (*callback)(void *data, pm_parser_t *parser, pm_token_t *token);
272
+ } pm_lex_callback_t;
273
+
274
+ // This struct represents a node in a linked list of scopes. Some scopes can see
275
+ // into their parent scopes, while others cannot.
276
+ typedef struct pm_scope {
277
+ // The IDs of the locals in the given scope.
278
+ pm_constant_id_list_t locals;
279
+
280
+ // A pointer to the previous scope in the linked list.
281
+ struct pm_scope *previous;
282
+
283
+ // A boolean indicating whether or not this scope can see into its parent.
284
+ // If closed is true, then the scope cannot see into its parent.
285
+ bool closed;
286
+
287
+ // A boolean indicating whether or not this scope has explicit parameters.
288
+ // This is necessary to determine whether or not numbered parameters are
289
+ // allowed.
290
+ bool explicit_params;
291
+
292
+ // A boolean indicating whether or not this scope has numbered parameters.
293
+ // This is necessary to determine if child blocks are allowed to use
294
+ // numbered parameters.
295
+ bool numbered_params;
296
+ } pm_scope_t;
297
+
298
+ // This struct represents the overall parser. It contains a reference to the
299
+ // source file, as well as pointers that indicate where in the source it's
300
+ // currently parsing. It also contains the most recent and current token that
301
+ // it's considering.
302
+ struct pm_parser {
303
+ pm_lex_state_t lex_state; // the current state of the lexer
304
+ int enclosure_nesting; // tracks the current nesting of (), [], and {}
305
+
306
+ // Used to temporarily track the nesting of enclosures to determine if a {
307
+ // is the beginning of a lambda following the parameters of a lambda.
308
+ int lambda_enclosure_nesting;
309
+
310
+ // Used to track the nesting of braces to ensure we get the correct value
311
+ // when we are interpolating blocks with braces.
312
+ int brace_nesting;
313
+
314
+ // the stack used to determine if a do keyword belongs to the predicate of a
315
+ // while, until, or for loop
316
+ pm_state_stack_t do_loop_stack;
317
+
318
+ // the stack used to determine if a do keyword belongs to the beginning of a
319
+ // block
320
+ pm_state_stack_t accepts_block_stack;
321
+
322
+ struct {
323
+ pm_lex_mode_t *current; // the current mode of the lexer
324
+ pm_lex_mode_t stack[PM_LEX_STACK_SIZE]; // the stack of lexer modes
325
+ size_t index; // the current index into the lexer mode stack
326
+ } lex_modes;
327
+
328
+ const uint8_t *start; // the pointer to the start of the source
329
+ const uint8_t *end; // the pointer to the end of the source
330
+ pm_token_t previous; // the previous token we were considering
331
+ pm_token_t current; // the current token we're considering
332
+
333
+ // This is a special field set on the parser when we need the parser to jump
334
+ // to a specific location when lexing the next token, as opposed to just
335
+ // using the end of the previous token. Normally this is NULL.
336
+ const uint8_t *next_start;
337
+
338
+ // This field indicates the end of a heredoc whose identifier was found on
339
+ // the current line. If another heredoc is found on the same line, then this
340
+ // will be moved forward to the end of that heredoc. If no heredocs are
341
+ // found on a line then this is NULL.
342
+ const uint8_t *heredoc_end;
343
+
344
+ pm_list_t comment_list; // the list of comments that have been found while parsing
345
+ pm_list_t warning_list; // the list of warnings that have been found while parsing
346
+ pm_list_t error_list; // the list of errors that have been found while parsing
347
+ pm_scope_t *current_scope; // the current local scope
348
+
349
+ pm_context_node_t *current_context; // the current parsing context
350
+
351
+ // The encoding functions for the current file are attached to the parser as
352
+ // it's parsing so that it can change with a magic comment.
353
+ pm_encoding_t encoding;
354
+
355
+ // When the encoding that is being used to parse the source is changed by
356
+ // prism, we provide the ability here to call out to a user-defined
357
+ // function.
358
+ pm_encoding_changed_callback_t encoding_changed_callback;
359
+
360
+ // When an encoding is encountered that isn't understood by prism, we
361
+ // provide the ability here to call out to a user-defined function to get an
362
+ // encoding struct. If the function returns something that isn't NULL, we
363
+ // set that to our encoding and use it to parse identifiers.
364
+ pm_encoding_decode_callback_t encoding_decode_callback;
365
+
366
+ // This pointer indicates where a comment must start if it is to be
367
+ // considered an encoding comment.
368
+ const uint8_t *encoding_comment_start;
369
+
370
+ // This is an optional callback that can be attached to the parser that will
371
+ // be called whenever a new token is lexed by the parser.
372
+ pm_lex_callback_t *lex_callback;
373
+
374
+ // This is the path of the file being parsed.
375
+ // We use the filepath when constructing SourceFileNodes.
376
+ pm_string_t filepath_string;
377
+
378
+ // This constant pool keeps all of the constants defined throughout the file
379
+ // so that we can reference them later.
380
+ pm_constant_pool_t constant_pool;
381
+
382
+ // This is the list of newline offsets in the source file.
383
+ pm_newline_list_t newline_list;
384
+
385
+ // We want to add a flag to integer nodes that indicates their base. We only
386
+ // want to parse these once, but we don't have space on the token itself to
387
+ // communicate this information. So we store it here and pass it through
388
+ // when we find tokens that we need it for.
389
+ pm_node_flags_t integer_base;
390
+
391
+ // Whether or not we're at the beginning of a command
392
+ bool command_start;
393
+
394
+ // Whether or not we're currently recovering from a syntax error
395
+ bool recovering;
396
+
397
+ // Whether or not the encoding has been changed by a magic comment. We use
398
+ // this to provide a fast path for the lexer instead of going through the
399
+ // function pointer.
400
+ bool encoding_changed;
401
+
402
+ // This flag indicates that we are currently parsing a pattern matching
403
+ // expression and impacts the calculation of newlines.
404
+ bool pattern_matching_newlines;
405
+
406
+ // This flag indicates that we are currently parsing a keyword argument.
407
+ bool in_keyword_arg;
408
+
409
+ // Whether or not the parser has seen a token that has semantic meaning
410
+ // (i.e., a token that is not a comment or whitespace).
411
+ bool semantic_token_seen;
412
+
413
+ // Whether or not we have found a frozen_string_literal magic comment with
414
+ // a true value.
415
+ bool frozen_string_literal;
416
+ };
417
+
418
+ #endif // PRISM_PARSER_H
@@ -0,0 +1,19 @@
1
+ #ifndef PRISM_REGEXP_H
2
+ #define PRISM_REGEXP_H
3
+
4
+ #include "prism/defines.h"
5
+ #include "prism/parser.h"
6
+ #include "prism/enc/pm_encoding.h"
7
+ #include "prism/util/pm_memchr.h"
8
+ #include "prism/util/pm_string_list.h"
9
+ #include "prism/util/pm_string.h"
10
+
11
+ #include <stdbool.h>
12
+ #include <stddef.h>
13
+ #include <string.h>
14
+
15
+ // Parse a regular expression and extract the names of all of the named capture
16
+ // groups.
17
+ PRISM_EXPORTED_FUNCTION bool pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding);
18
+
19
+ #endif
@@ -0,0 +1,48 @@
1
+ #ifndef PRISM_UNESCAPE_H
2
+ #define PRISM_UNESCAPE_H
3
+
4
+ #include "prism/defines.h"
5
+ #include "prism/diagnostic.h"
6
+ #include "prism/parser.h"
7
+ #include "prism/util/pm_char.h"
8
+ #include "prism/util/pm_list.h"
9
+ #include "prism/util/pm_memchr.h"
10
+ #include "prism/util/pm_string.h"
11
+
12
+ #include <assert.h>
13
+ #include <stdbool.h>
14
+ #include <stdint.h>
15
+ #include <string.h>
16
+
17
+ // The type of unescape we are performing.
18
+ typedef enum {
19
+ // When we're creating a string inside of a list literal like %w, we
20
+ // shouldn't unescape anything.
21
+ PM_UNESCAPE_NONE,
22
+
23
+ // When we're unescaping a single-quoted string, we only need to unescape
24
+ // single quotes and backslashes.
25
+ PM_UNESCAPE_MINIMAL,
26
+
27
+ // When we're unescaping a string list, in addition to MINIMAL, we need to
28
+ // unescape whitespace.
29
+ PM_UNESCAPE_WHITESPACE,
30
+
31
+ // When we're unescaping a double-quoted string, we need to unescape all
32
+ // escapes.
33
+ PM_UNESCAPE_ALL,
34
+ } pm_unescape_type_t;
35
+
36
+ // Unescape the contents of the given token into the given string using the given unescape mode.
37
+ PRISM_EXPORTED_FUNCTION void pm_unescape_manipulate_string(pm_parser_t *parser, pm_string_t *string, pm_unescape_type_t unescape_type);
38
+ void pm_unescape_manipulate_char_literal(pm_parser_t *parser, pm_string_t *string, pm_unescape_type_t unescape_type);
39
+
40
+ // Accepts a source string and a type of unescaping and returns the unescaped version.
41
+ // The caller must pm_string_free(result); after calling this function.
42
+ PRISM_EXPORTED_FUNCTION bool pm_unescape_string(const uint8_t *start, size_t length, pm_unescape_type_t unescape_type, pm_string_t *result);
43
+
44
+ // Returns the number of bytes that encompass the first escape sequence in the
45
+ // given string.
46
+ size_t pm_unescape_calculate_difference(pm_parser_t *parser, const uint8_t *value, pm_unescape_type_t unescape_type, bool expect_single_codepoint);
47
+
48
+ #endif
@@ -0,0 +1,51 @@
1
+ #ifndef PRISM_BUFFER_H
2
+ #define PRISM_BUFFER_H
3
+
4
+ #include "prism/defines.h"
5
+
6
+ #include <assert.h>
7
+ #include <stdbool.h>
8
+ #include <stdint.h>
9
+ #include <stdlib.h>
10
+ #include <string.h>
11
+
12
+ // A pm_buffer_t is a simple memory buffer that stores data in a contiguous
13
+ // block of memory. It is used to store the serialized representation of a
14
+ // prism tree.
15
+ typedef struct {
16
+ char *value;
17
+ size_t length;
18
+ size_t capacity;
19
+ } pm_buffer_t;
20
+
21
+ // Return the size of the pm_buffer_t struct.
22
+ PRISM_EXPORTED_FUNCTION size_t pm_buffer_sizeof(void);
23
+
24
+ // Initialize a pm_buffer_t with its default values.
25
+ PRISM_EXPORTED_FUNCTION bool pm_buffer_init(pm_buffer_t *buffer);
26
+
27
+ // Return the value of the buffer.
28
+ PRISM_EXPORTED_FUNCTION char * pm_buffer_value(pm_buffer_t *buffer);
29
+
30
+ // Return the length of the buffer.
31
+ PRISM_EXPORTED_FUNCTION size_t pm_buffer_length(pm_buffer_t *buffer);
32
+
33
+ // Append the given amount of space as zeroes to the buffer.
34
+ void pm_buffer_append_zeroes(pm_buffer_t *buffer, size_t length);
35
+
36
+ // Append a string to the buffer.
37
+ void pm_buffer_append_str(pm_buffer_t *buffer, const char *value, size_t length);
38
+
39
+ // Append a list of bytes to the buffer.
40
+ void pm_buffer_append_bytes(pm_buffer_t *buffer, const uint8_t *value, size_t length);
41
+
42
+ // Append a single byte to the buffer.
43
+ void pm_buffer_append_u8(pm_buffer_t *buffer, uint8_t value);
44
+
45
+ // Append a 32-bit unsigned integer to the buffer.
46
+ void pm_buffer_append_u32(pm_buffer_t *buffer, uint32_t value);
47
+
48
+ // Free the memory associated with the buffer.
49
+ PRISM_EXPORTED_FUNCTION void pm_buffer_free(pm_buffer_t *buffer);
50
+
51
+ #endif
@@ -0,0 +1,91 @@
1
+ #ifndef PRISM_CHAR_H
2
+ #define PRISM_CHAR_H
3
+
4
+ #include "prism/defines.h"
5
+ #include "prism/util/pm_newline_list.h"
6
+
7
+ #include <stdbool.h>
8
+ #include <stddef.h>
9
+
10
+ // Returns the number of characters at the start of the string that are
11
+ // whitespace. Disallows searching past the given maximum number of characters.
12
+ size_t pm_strspn_whitespace(const uint8_t *string, ptrdiff_t length);
13
+
14
+ // Returns the number of characters at the start of the string that are
15
+ // whitespace while also tracking the location of each newline. Disallows
16
+ // searching past the given maximum number of characters.
17
+ size_t
18
+ pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_newline_list_t *newline_list);
19
+
20
+ // Returns the number of characters at the start of the string that are inline
21
+ // whitespace. Disallows searching past the given maximum number of characters.
22
+ size_t pm_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length);
23
+
24
+ // Returns the number of characters at the start of the string that are decimal
25
+ // digits. Disallows searching past the given maximum number of characters.
26
+ size_t pm_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length);
27
+
28
+ // Returns the number of characters at the start of the string that are
29
+ // hexadecimal digits. Disallows searching past the given maximum number of
30
+ // characters.
31
+ size_t pm_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length);
32
+
33
+ // Returns the number of characters at the start of the string that are octal
34
+ // digits or underscores. Disallows searching past the given maximum number of
35
+ // characters.
36
+ //
37
+ // If multiple underscores are found in a row or if an underscore is
38
+ // found at the end of the number, then the invalid pointer is set to the index
39
+ // of the first invalid underscore.
40
+ size_t pm_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
41
+
42
+ // Returns the number of characters at the start of the string that are decimal
43
+ // digits or underscores. Disallows searching past the given maximum number of
44
+ // characters.
45
+ //
46
+ // If multiple underscores are found in a row or if an underscore is
47
+ // found at the end of the number, then the invalid pointer is set to the index
48
+ // of the first invalid underscore.
49
+ size_t pm_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
50
+
51
+ // Returns the number of characters at the start of the string that are
52
+ // hexadecimal digits or underscores. Disallows searching past the given maximum
53
+ // number of characters.
54
+ //
55
+ // If multiple underscores are found in a row or if an underscore is
56
+ // found at the end of the number, then the invalid pointer is set to the index
57
+ // of the first invalid underscore.
58
+ size_t pm_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
59
+
60
+ // Returns the number of characters at the start of the string that are regexp
61
+ // options. Disallows searching past the given maximum number of characters.
62
+ size_t pm_strspn_regexp_option(const uint8_t *string, ptrdiff_t length);
63
+
64
+ // Returns the number of characters at the start of the string that are binary
65
+ // digits or underscores. Disallows searching past the given maximum number of
66
+ // characters.
67
+ //
68
+ // If multiple underscores are found in a row or if an underscore is
69
+ // found at the end of the number, then the invalid pointer is set to the index
70
+ // of the first invalid underscore.
71
+ size_t pm_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
72
+
73
+ // Returns true if the given character is a whitespace character.
74
+ bool pm_char_is_whitespace(const uint8_t b);
75
+
76
+ // Returns true if the given character is an inline whitespace character.
77
+ bool pm_char_is_inline_whitespace(const uint8_t b);
78
+
79
+ // Returns true if the given character is a binary digit.
80
+ bool pm_char_is_binary_digit(const uint8_t b);
81
+
82
+ // Returns true if the given character is an octal digit.
83
+ bool pm_char_is_octal_digit(const uint8_t b);
84
+
85
+ // Returns true if the given character is a decimal digit.
86
+ bool pm_char_is_decimal_digit(const uint8_t b);
87
+
88
+ // Returns true if the given character is a hexadecimal digit.
89
+ bool pm_char_is_hexadecimal_digit(const uint8_t b);
90
+
91
+ #endif