prism 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -1
- data/README.md +2 -1
- data/config.yml +188 -55
- data/docs/building.md +9 -2
- data/docs/configuration.md +10 -9
- data/docs/encoding.md +24 -56
- data/docs/local_variable_depth.md +229 -0
- data/docs/ruby_api.md +2 -0
- data/docs/serialization.md +18 -13
- data/ext/prism/api_node.c +337 -195
- data/ext/prism/extconf.rb +13 -7
- data/ext/prism/extension.c +96 -32
- data/ext/prism/extension.h +1 -1
- data/include/prism/ast.h +340 -137
- data/include/prism/defines.h +17 -0
- data/include/prism/diagnostic.h +11 -5
- data/include/prism/encoding.h +248 -0
- data/include/prism/options.h +2 -2
- data/include/prism/parser.h +62 -42
- data/include/prism/regexp.h +2 -2
- data/include/prism/util/pm_buffer.h +9 -1
- data/include/prism/util/pm_memchr.h +2 -2
- data/include/prism/util/pm_strpbrk.h +3 -3
- data/include/prism/version.h +2 -2
- data/include/prism.h +13 -15
- data/lib/prism/compiler.rb +12 -0
- data/lib/prism/debug.rb +9 -4
- data/lib/prism/desugar_compiler.rb +3 -3
- data/lib/prism/dispatcher.rb +56 -0
- data/lib/prism/dot_visitor.rb +476 -198
- data/lib/prism/dsl.rb +66 -46
- data/lib/prism/ffi.rb +16 -3
- data/lib/prism/lex_compat.rb +19 -9
- data/lib/prism/mutation_compiler.rb +20 -0
- data/lib/prism/node.rb +1173 -450
- data/lib/prism/node_ext.rb +41 -16
- data/lib/prism/parse_result.rb +12 -15
- data/lib/prism/ripper_compat.rb +49 -34
- data/lib/prism/serialize.rb +242 -212
- data/lib/prism/visitor.rb +12 -0
- data/lib/prism.rb +20 -4
- data/prism.gemspec +4 -10
- data/rbi/prism.rbi +605 -230
- data/rbi/prism_static.rbi +3 -0
- data/sig/prism.rbs +379 -124
- data/sig/prism_static.rbs +1 -0
- data/src/diagnostic.c +228 -222
- data/src/encoding.c +5137 -0
- data/src/node.c +66 -0
- data/src/options.c +21 -2
- data/src/prettyprint.c +806 -406
- data/src/prism.c +1092 -700
- data/src/regexp.c +3 -3
- data/src/serialize.c +227 -157
- data/src/util/pm_buffer.c +10 -1
- data/src/util/pm_memchr.c +1 -1
- data/src/util/pm_strpbrk.c +4 -4
- metadata +5 -11
- data/include/prism/enc/pm_encoding.h +0 -227
- data/src/enc/pm_big5.c +0 -116
- data/src/enc/pm_cp51932.c +0 -57
- data/src/enc/pm_euc_jp.c +0 -69
- data/src/enc/pm_gbk.c +0 -65
- data/src/enc/pm_shift_jis.c +0 -57
- data/src/enc/pm_tables.c +0 -2073
- data/src/enc/pm_unicode.c +0 -2369
- data/src/enc/pm_windows_31j.c +0 -57
data/include/prism/defines.h
CHANGED
@@ -74,4 +74,21 @@
|
|
74
74
|
# define snprintf _snprintf
|
75
75
|
#endif
|
76
76
|
|
77
|
+
/**
|
78
|
+
* A simple utility macro to concatenate two tokens together, necessary when one
|
79
|
+
* of the tokens is itself a macro.
|
80
|
+
*/
|
81
|
+
#define PM_CONCATENATE(left, right) left ## right
|
82
|
+
|
83
|
+
/**
 * Compile-time assertion. _Static_assert was only standardized in C11, and
 * because it is a keyword rather than a macro it cannot be detected with
 * defined(...); the language version is checked instead. On compilers that do
 * not report C11 or later (including anything that does not define
 * __STDC_VERSION__) we polyfill it with a hacky typedef of a char array whose
 * size is negative, and therefore a compile error, when the condition is false.
 *
 * @param line A token (e.g. __LINE__) used to build a unique typedef name in
 *     the fallback via PM_CONCATENATE; unused when _Static_assert is available.
 * @param condition The constant expression that must hold.
 * @param message A string literal describing the failure; it is only reported
 *     when _Static_assert is available.
 */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
#   define PM_STATIC_ASSERT(line, condition, message) _Static_assert(condition, message)
#else
#   define PM_STATIC_ASSERT(line, condition, message) typedef char PM_CONCATENATE(static_assert_, line)[(condition) ? 1 : -1]
#endif
|
93
|
+
|
77
94
|
#endif
|
data/include/prism/diagnostic.h
CHANGED
@@ -6,6 +6,7 @@
|
|
6
6
|
#ifndef PRISM_DIAGNOSTIC_H
|
7
7
|
#define PRISM_DIAGNOSTIC_H
|
8
8
|
|
9
|
+
#include "prism/ast.h"
|
9
10
|
#include "prism/defines.h"
|
10
11
|
#include "prism/util/pm_list.h"
|
11
12
|
|
@@ -22,11 +23,8 @@ typedef struct {
|
|
22
23
|
/** The embedded base node. */
|
23
24
|
pm_list_node_t node;
|
24
25
|
|
25
|
-
/**
|
26
|
-
|
27
|
-
|
28
|
-
/** A pointer to the end of the source that generated the diagnostic. */
|
29
|
-
const uint8_t *end;
|
26
|
+
/** The location of the diagnostic in the source. */
|
27
|
+
pm_location_t location;
|
30
28
|
|
31
29
|
/** The message associated with the diagnostic. */
|
32
30
|
const char *message;
|
@@ -179,6 +177,7 @@ typedef enum {
|
|
179
177
|
PM_ERR_LIST_W_UPPER_ELEMENT,
|
180
178
|
PM_ERR_LIST_W_UPPER_TERM,
|
181
179
|
PM_ERR_MALLOC_FAILED,
|
180
|
+
PM_ERR_MIXED_ENCODING,
|
182
181
|
PM_ERR_MODULE_IN_METHOD,
|
183
182
|
PM_ERR_MODULE_NAME,
|
184
183
|
PM_ERR_MODULE_TERM,
|
@@ -192,6 +191,7 @@ typedef enum {
|
|
192
191
|
PM_ERR_OPERATOR_WRITE_BLOCK,
|
193
192
|
PM_ERR_PARAMETER_ASSOC_SPLAT_MULTI,
|
194
193
|
PM_ERR_PARAMETER_BLOCK_MULTI,
|
194
|
+
PM_ERR_PARAMETER_CIRCULAR,
|
195
195
|
PM_ERR_PARAMETER_METHOD_NAME,
|
196
196
|
PM_ERR_PARAMETER_NAME_REPEAT,
|
197
197
|
PM_ERR_PARAMETER_NO_DEFAULT,
|
@@ -211,6 +211,7 @@ typedef enum {
|
|
211
211
|
PM_ERR_PATTERN_EXPRESSION_AFTER_PIN,
|
212
212
|
PM_ERR_PATTERN_EXPRESSION_AFTER_PIPE,
|
213
213
|
PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE,
|
214
|
+
PM_ERR_PATTERN_EXPRESSION_AFTER_REST,
|
214
215
|
PM_ERR_PATTERN_HASH_KEY,
|
215
216
|
PM_ERR_PATTERN_HASH_KEY_LABEL,
|
216
217
|
PM_ERR_PATTERN_IDENT_AFTER_HROCKET,
|
@@ -226,6 +227,10 @@ typedef enum {
|
|
226
227
|
PM_ERR_RESCUE_TERM,
|
227
228
|
PM_ERR_RESCUE_VARIABLE,
|
228
229
|
PM_ERR_RETURN_INVALID,
|
230
|
+
PM_ERR_STATEMENT_ALIAS,
|
231
|
+
PM_ERR_STATEMENT_POSTEXE_END,
|
232
|
+
PM_ERR_STATEMENT_PREEXE_BEGIN,
|
233
|
+
PM_ERR_STATEMENT_UNDEF,
|
229
234
|
PM_ERR_STRING_CONCATENATION,
|
230
235
|
PM_ERR_STRING_INTERPOLATED_TERM,
|
231
236
|
PM_ERR_STRING_LITERAL_TERM,
|
@@ -243,6 +248,7 @@ typedef enum {
|
|
243
248
|
PM_ERR_UNTIL_TERM,
|
244
249
|
PM_ERR_VOID_EXPRESSION,
|
245
250
|
PM_ERR_WHILE_TERM,
|
251
|
+
PM_ERR_WRITE_TARGET_IN_METHOD,
|
246
252
|
PM_ERR_WRITE_TARGET_READONLY,
|
247
253
|
PM_ERR_WRITE_TARGET_UNEXPECTED,
|
248
254
|
PM_ERR_XSTRING_TERM,
|
@@ -0,0 +1,248 @@
|
|
1
|
+
/**
|
2
|
+
* @file encoding.h
|
3
|
+
*
|
4
|
+
* The encoding interface and implementations used by the parser.
|
5
|
+
*/
|
6
|
+
#ifndef PRISM_ENCODING_H
|
7
|
+
#define PRISM_ENCODING_H
|
8
|
+
|
9
|
+
#include "prism/defines.h"
|
10
|
+
#include "prism/util/pm_strncasecmp.h"
|
11
|
+
|
12
|
+
#include <assert.h>
|
13
|
+
#include <stdbool.h>
|
14
|
+
#include <stddef.h>
|
15
|
+
#include <stdint.h>
|
16
|
+
|
17
|
+
/**
|
18
|
+
* This struct defines the functions necessary to implement the encoding
|
19
|
+
* interface so we can determine how many bytes the subsequent character takes.
|
20
|
+
* Each callback should return the number of bytes, or 0 if the next bytes are
|
21
|
+
* invalid for the encoding and type.
|
22
|
+
*/
|
23
|
+
typedef struct {
|
24
|
+
/**
|
25
|
+
* Return the number of bytes that the next character takes if it is valid
|
26
|
+
* in the encoding. Does not read more than n bytes. It is assumed that n is
|
27
|
+
* at least 1.
|
28
|
+
*/
|
29
|
+
size_t (*char_width)(const uint8_t *b, ptrdiff_t n);
|
30
|
+
|
31
|
+
/**
|
32
|
+
* Return the number of bytes that the next character takes if it is valid
|
33
|
+
* in the encoding and is alphabetical. Does not read more than n bytes. It
|
34
|
+
* is assumed that n is at least 1.
|
35
|
+
*/
|
36
|
+
size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);
|
37
|
+
|
38
|
+
/**
|
39
|
+
* Return the number of bytes that the next character takes if it is valid
|
40
|
+
* in the encoding and is alphanumeric. Does not read more than n bytes. It
|
41
|
+
* is assumed that n is at least 1.
|
42
|
+
*/
|
43
|
+
size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);
|
44
|
+
|
45
|
+
/**
|
46
|
+
* Return true if the next character is valid in the encoding and is an
|
47
|
+
* uppercase character. Does not read more than n bytes. It is assumed that
|
48
|
+
* n is at least 1.
|
49
|
+
*/
|
50
|
+
bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);
|
51
|
+
|
52
|
+
/**
|
53
|
+
* The name of the encoding. This should correspond to a value that can be
|
54
|
+
* passed to Encoding.find in Ruby.
|
55
|
+
*/
|
56
|
+
const char *name;
|
57
|
+
|
58
|
+
/**
|
59
|
+
* Whether or not the encoding is a multibyte encoding.
|
60
|
+
*/
|
61
|
+
bool multibyte;
|
62
|
+
} pm_encoding_t;
|
63
|
+
|
64
|
+
/**
|
65
|
+
* All of the lookup tables use the first bit of each embedded byte to indicate
|
66
|
+
* whether the codepoint is alphabetical.
|
67
|
+
*/
|
68
|
+
#define PRISM_ENCODING_ALPHABETIC_BIT (1 << 0) /* parenthesized so the shift survives adjacent operators */
|
69
|
+
|
70
|
+
/**
|
71
|
+
* All of the lookup tables use the second bit of each embedded byte to indicate
|
72
|
+
* whether the codepoint is alphanumeric.
|
73
|
+
*/
|
74
|
+
#define PRISM_ENCODING_ALPHANUMERIC_BIT (1 << 1) /* parenthesized so the shift survives adjacent operators */
|
75
|
+
|
76
|
+
/**
|
77
|
+
* All of the lookup tables use the third bit of each embedded byte to indicate
|
78
|
+
* whether the codepoint is uppercase.
|
79
|
+
*/
|
80
|
+
#define PRISM_ENCODING_UPPERCASE_BIT (1 << 2) /* parenthesized so the shift survives adjacent operators */
|
81
|
+
|
82
|
+
/**
|
83
|
+
* Return the size of the next character in the UTF-8 encoding if it is an
|
84
|
+
* alphabetical character.
|
85
|
+
*
|
86
|
+
* @param b The bytes to read.
|
87
|
+
* @param n The number of bytes that can be read.
|
88
|
+
* @returns The number of bytes that the next character takes if it is valid in
|
89
|
+
* the encoding, or 0 if it is not.
|
90
|
+
*/
|
91
|
+
size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
|
92
|
+
|
93
|
+
/**
|
94
|
+
* Return the size of the next character in the UTF-8 encoding if it is an
|
95
|
+
* alphanumeric character.
|
96
|
+
*
|
97
|
+
* @param b The bytes to read.
|
98
|
+
* @param n The number of bytes that can be read.
|
99
|
+
* @returns The number of bytes that the next character takes if it is valid in
|
100
|
+
* the encoding, or 0 if it is not.
|
101
|
+
*/
|
102
|
+
size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
|
103
|
+
|
104
|
+
/**
|
105
|
+
* Return true if the next character in the UTF-8 encoding is an uppercase
|
106
|
+
* character.
|
107
|
+
*
|
108
|
+
* @param b The bytes to read.
|
109
|
+
* @param n The number of bytes that can be read.
|
110
|
+
* @returns True if the next character is valid in the encoding and is an
|
111
|
+
* uppercase character, or false if it is not.
|
112
|
+
*/
|
113
|
+
bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
|
114
|
+
|
115
|
+
/**
|
116
|
+
* This lookup table is referenced in both the UTF-8 encoding file and the
|
117
|
+
* parser directly in order to speed up the default encoding processing. It is
|
118
|
+
* used to indicate whether a character is alphabetical, alphanumeric, or
|
119
|
+
* uppercase in unicode mappings.
|
120
|
+
*/
|
121
|
+
extern const uint8_t pm_encoding_unicode_table[256];
|
122
|
+
|
123
|
+
/**
|
124
|
+
* These are all of the encodings that prism supports.
|
125
|
+
*/
|
126
|
+
typedef enum {
|
127
|
+
PM_ENCODING_UTF_8 = 0,
|
128
|
+
PM_ENCODING_ASCII_8BIT,
|
129
|
+
PM_ENCODING_BIG5,
|
130
|
+
PM_ENCODING_BIG5_HKSCS,
|
131
|
+
PM_ENCODING_BIG5_UAO,
|
132
|
+
PM_ENCODING_CESU_8,
|
133
|
+
PM_ENCODING_CP51932,
|
134
|
+
PM_ENCODING_CP850,
|
135
|
+
PM_ENCODING_CP852,
|
136
|
+
PM_ENCODING_CP855,
|
137
|
+
PM_ENCODING_CP949,
|
138
|
+
PM_ENCODING_CP950,
|
139
|
+
PM_ENCODING_CP951,
|
140
|
+
PM_ENCODING_EMACS_MULE,
|
141
|
+
PM_ENCODING_EUC_JP,
|
142
|
+
PM_ENCODING_EUC_JP_MS,
|
143
|
+
PM_ENCODING_EUC_JIS_2004,
|
144
|
+
PM_ENCODING_EUC_KR,
|
145
|
+
PM_ENCODING_EUC_TW,
|
146
|
+
PM_ENCODING_GB12345,
|
147
|
+
PM_ENCODING_GB18030,
|
148
|
+
PM_ENCODING_GB1988,
|
149
|
+
PM_ENCODING_GB2312,
|
150
|
+
PM_ENCODING_GBK,
|
151
|
+
PM_ENCODING_IBM437,
|
152
|
+
PM_ENCODING_IBM720,
|
153
|
+
PM_ENCODING_IBM737,
|
154
|
+
PM_ENCODING_IBM775,
|
155
|
+
PM_ENCODING_IBM852,
|
156
|
+
PM_ENCODING_IBM855,
|
157
|
+
PM_ENCODING_IBM857,
|
158
|
+
PM_ENCODING_IBM860,
|
159
|
+
PM_ENCODING_IBM861,
|
160
|
+
PM_ENCODING_IBM862,
|
161
|
+
PM_ENCODING_IBM863,
|
162
|
+
PM_ENCODING_IBM864,
|
163
|
+
PM_ENCODING_IBM865,
|
164
|
+
PM_ENCODING_IBM866,
|
165
|
+
PM_ENCODING_IBM869,
|
166
|
+
PM_ENCODING_ISO_8859_1,
|
167
|
+
PM_ENCODING_ISO_8859_2,
|
168
|
+
PM_ENCODING_ISO_8859_3,
|
169
|
+
PM_ENCODING_ISO_8859_4,
|
170
|
+
PM_ENCODING_ISO_8859_5,
|
171
|
+
PM_ENCODING_ISO_8859_6,
|
172
|
+
PM_ENCODING_ISO_8859_7,
|
173
|
+
PM_ENCODING_ISO_8859_8,
|
174
|
+
PM_ENCODING_ISO_8859_9,
|
175
|
+
PM_ENCODING_ISO_8859_10,
|
176
|
+
PM_ENCODING_ISO_8859_11,
|
177
|
+
PM_ENCODING_ISO_8859_13,
|
178
|
+
PM_ENCODING_ISO_8859_14,
|
179
|
+
PM_ENCODING_ISO_8859_15,
|
180
|
+
PM_ENCODING_ISO_8859_16,
|
181
|
+
PM_ENCODING_KOI8_R,
|
182
|
+
PM_ENCODING_KOI8_U,
|
183
|
+
PM_ENCODING_MAC_CENT_EURO,
|
184
|
+
PM_ENCODING_MAC_CROATIAN,
|
185
|
+
PM_ENCODING_MAC_CYRILLIC,
|
186
|
+
PM_ENCODING_MAC_GREEK,
|
187
|
+
PM_ENCODING_MAC_ICELAND,
|
188
|
+
PM_ENCODING_MAC_JAPANESE,
|
189
|
+
PM_ENCODING_MAC_ROMAN,
|
190
|
+
PM_ENCODING_MAC_ROMANIA,
|
191
|
+
PM_ENCODING_MAC_THAI,
|
192
|
+
PM_ENCODING_MAC_TURKISH,
|
193
|
+
PM_ENCODING_MAC_UKRAINE,
|
194
|
+
PM_ENCODING_SHIFT_JIS,
|
195
|
+
PM_ENCODING_SJIS_DOCOMO,
|
196
|
+
PM_ENCODING_SJIS_KDDI,
|
197
|
+
PM_ENCODING_SJIS_SOFTBANK,
|
198
|
+
PM_ENCODING_STATELESS_ISO_2022_JP,
|
199
|
+
PM_ENCODING_STATELESS_ISO_2022_JP_KDDI,
|
200
|
+
PM_ENCODING_TIS_620,
|
201
|
+
PM_ENCODING_US_ASCII,
|
202
|
+
PM_ENCODING_UTF8_MAC,
|
203
|
+
PM_ENCODING_UTF8_DOCOMO,
|
204
|
+
PM_ENCODING_UTF8_KDDI,
|
205
|
+
PM_ENCODING_UTF8_SOFTBANK,
|
206
|
+
PM_ENCODING_WINDOWS_1250,
|
207
|
+
PM_ENCODING_WINDOWS_1251,
|
208
|
+
PM_ENCODING_WINDOWS_1252,
|
209
|
+
PM_ENCODING_WINDOWS_1253,
|
210
|
+
PM_ENCODING_WINDOWS_1254,
|
211
|
+
PM_ENCODING_WINDOWS_1255,
|
212
|
+
PM_ENCODING_WINDOWS_1256,
|
213
|
+
PM_ENCODING_WINDOWS_1257,
|
214
|
+
PM_ENCODING_WINDOWS_1258,
|
215
|
+
PM_ENCODING_WINDOWS_31J,
|
216
|
+
PM_ENCODING_WINDOWS_874,
|
217
|
+
PM_ENCODING_MAXIMUM
|
218
|
+
} pm_encoding_type_t;
|
219
|
+
|
220
|
+
/**
|
221
|
+
* This is the table of all of the encodings that prism supports.
|
222
|
+
*/
|
223
|
+
extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
|
224
|
+
|
225
|
+
/**
|
226
|
+
* This is the default UTF-8 encoding. We need a reference to it to quickly
|
227
|
+
* create parsers.
|
228
|
+
*/
|
229
|
+
#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8])
|
230
|
+
|
231
|
+
/**
|
232
|
+
* This is the US-ASCII encoding. We need a reference to it to be able to
|
233
|
+
* compare against it when a string is being created because it could possibly
|
234
|
+
* need to fall back to ASCII-8BIT.
|
235
|
+
*/
|
236
|
+
#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
|
237
|
+
|
238
|
+
/**
|
239
|
+
* Parse the given name of an encoding and return a pointer to the corresponding
|
240
|
+
* encoding struct if one can be found, otherwise return NULL.
|
241
|
+
*
|
242
|
+
* @param start A pointer to the first byte of the name.
|
243
|
+
* @param end A pointer to the last byte of the name.
|
244
|
+
* @returns A pointer to the encoding struct if one is found, otherwise NULL.
|
245
|
+
*/
|
246
|
+
const pm_encoding_t * pm_encoding_find(const uint8_t *start, const uint8_t *end);
|
247
|
+
|
248
|
+
#endif
|
data/include/prism/options.h
CHANGED
@@ -35,7 +35,7 @@ typedef struct {
|
|
35
35
|
* The line within the file that the parse starts on. This value is
|
36
36
|
* 0-indexed.
|
37
37
|
*/
|
38
|
-
|
38
|
+
int32_t line;
|
39
39
|
|
40
40
|
/**
|
41
41
|
* The name of the encoding that the source file is in. Note that this must
|
@@ -80,7 +80,7 @@ PRISM_EXPORTED_FUNCTION void pm_options_filepath_set(pm_options_t *options, cons
|
|
80
80
|
* @param options The options struct to set the line on.
|
81
81
|
* @param line The line to set.
|
82
82
|
*/
|
83
|
-
PRISM_EXPORTED_FUNCTION void pm_options_line_set(pm_options_t *options,
|
83
|
+
PRISM_EXPORTED_FUNCTION void pm_options_line_set(pm_options_t *options, int32_t line);
|
84
84
|
|
85
85
|
/**
|
86
86
|
* Set the encoding option on the given options struct.
|
data/include/prism/parser.h
CHANGED
@@ -8,7 +8,7 @@
|
|
8
8
|
|
9
9
|
#include "prism/ast.h"
|
10
10
|
#include "prism/defines.h"
|
11
|
-
#include "prism/
|
11
|
+
#include "prism/encoding.h"
|
12
12
|
#include "prism/util/pm_constant_pool.h"
|
13
13
|
#include "prism/util/pm_list.h"
|
14
14
|
#include "prism/util/pm_newline_list.h"
|
@@ -17,6 +17,12 @@
|
|
17
17
|
|
18
18
|
#include <stdbool.h>
|
19
19
|
|
20
|
+
// TODO: remove this by renaming the original flag
|
21
|
+
/**
|
22
|
+
* Temporary alias for the PM_KEYWORD_HASH_NODE_FLAGS_STATIC_KEYS flag.
|
23
|
+
*/
|
24
|
+
#define PM_KEYWORD_HASH_NODE_FLAGS_SYMBOL_KEYS PM_KEYWORD_HASH_NODE_FLAGS_STATIC_KEYS
|
25
|
+
|
20
26
|
/**
|
21
27
|
* This enum provides various bits that represent different kinds of states that
|
22
28
|
* the lexer can track. This is used to determine which kind of token to return
|
@@ -297,6 +303,9 @@ typedef enum {
|
|
297
303
|
/** an ensure statement */
|
298
304
|
PM_CONTEXT_ENSURE,
|
299
305
|
|
306
|
+
/** an ensure statement within a method definition */
|
307
|
+
PM_CONTEXT_ENSURE_DEF,
|
308
|
+
|
300
309
|
/** a for loop */
|
301
310
|
PM_CONTEXT_FOR,
|
302
311
|
|
@@ -333,9 +342,15 @@ typedef enum {
|
|
333
342
|
/** a rescue else statement */
|
334
343
|
PM_CONTEXT_RESCUE_ELSE,
|
335
344
|
|
345
|
+
/** a rescue else statement within a method definition */
|
346
|
+
PM_CONTEXT_RESCUE_ELSE_DEF,
|
347
|
+
|
336
348
|
/** a rescue statement */
|
337
349
|
PM_CONTEXT_RESCUE,
|
338
350
|
|
351
|
+
/** a rescue statement within a method definition */
|
352
|
+
PM_CONTEXT_RESCUE_DEF,
|
353
|
+
|
339
354
|
/** a singleton class definition */
|
340
355
|
PM_CONTEXT_SCLASS,
|
341
356
|
|
@@ -361,8 +376,7 @@ typedef struct pm_context_node {
|
|
361
376
|
/** This is the type of a comment that we've found while parsing. */
|
362
377
|
typedef enum {
|
363
378
|
PM_COMMENT_INLINE,
|
364
|
-
PM_COMMENT_EMBDOC
|
365
|
-
PM_COMMENT___END__
|
379
|
+
PM_COMMENT_EMBDOC
|
366
380
|
} pm_comment_type_t;
|
367
381
|
|
368
382
|
/**
|
@@ -374,11 +388,8 @@ typedef struct pm_comment {
|
|
374
388
|
/** The embedded base node. */
|
375
389
|
pm_list_node_t node;
|
376
390
|
|
377
|
-
/**
|
378
|
-
|
379
|
-
|
380
|
-
/** A pointer to the end of the comment in the source. */
|
381
|
-
const uint8_t *end;
|
391
|
+
/** The location of the comment in the source. */
|
392
|
+
pm_location_t location;
|
382
393
|
|
383
394
|
/** The type of comment that we've found. */
|
384
395
|
pm_comment_type_t type;
|
@@ -413,14 +424,6 @@ typedef struct {
|
|
413
424
|
*/
|
414
425
|
typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser);
|
415
426
|
|
416
|
-
/**
|
417
|
-
* When an encoding is encountered that isn't understood by prism, we provide
|
418
|
-
* the ability here to call out to a user-defined function to get an encoding
|
419
|
-
* struct. If the function returns something that isn't NULL, we set that to
|
420
|
-
* our encoding and use it to parse identifiers.
|
421
|
-
*/
|
422
|
-
typedef pm_encoding_t *(*pm_encoding_decode_callback_t)(pm_parser_t *parser, const uint8_t *name, size_t width);
|
423
|
-
|
424
427
|
/**
|
425
428
|
* When you are lexing through a file, the lexer needs all of the information
|
426
429
|
* that the parser additionally provides (for example, the local table). So if
|
@@ -469,18 +472,12 @@ typedef struct pm_scope {
|
|
469
472
|
bool explicit_params;
|
470
473
|
|
471
474
|
/**
|
472
|
-
*
|
475
|
+
* An integer indicating the number of numbered parameters on this scope.
|
473
476
|
* This is necessary to determine if child blocks are allowed to use
|
474
|
-
* numbered parameters
|
475
|
-
|
476
|
-
bool numbered_params;
|
477
|
-
|
478
|
-
/**
|
479
|
-
* A transparent scope is a scope that cannot have locals set on itself.
|
480
|
-
* When a local is set on this scope, it will instead be set on the parent
|
481
|
-
* scope's local table.
|
477
|
+
* numbered parameters, and to pass information to consumers of the AST
|
478
|
+
* about how many numbered parameters exist.
|
482
479
|
*/
|
483
|
-
|
480
|
+
uint8_t numbered_parameters;
|
484
481
|
} pm_scope_t;
|
485
482
|
|
486
483
|
/**
|
@@ -532,12 +529,6 @@ struct pm_parser {
|
|
532
529
|
size_t index;
|
533
530
|
} lex_modes;
|
534
531
|
|
535
|
-
/**
|
536
|
-
* The common_whitespace value from the most-recently-popped heredoc mode of the lexer, so we
|
537
|
-
* can dedent the heredoc after popping the lex mode.
|
538
|
-
*/
|
539
|
-
size_t current_string_common_whitespace;
|
540
|
-
|
541
532
|
/** The pointer to the start of the source. */
|
542
533
|
const uint8_t *start;
|
543
534
|
|
@@ -571,6 +562,9 @@ struct pm_parser {
|
|
571
562
|
/** The list of magic comments that have been found while parsing. */
|
572
563
|
pm_list_t magic_comment_list;
|
573
564
|
|
565
|
+
/** The optional location of the __END__ keyword and its contents. */
|
566
|
+
pm_location_t data_loc;
|
567
|
+
|
574
568
|
/** The list of warnings that have been found while parsing. */
|
575
569
|
pm_list_t warning_list;
|
576
570
|
|
@@ -587,7 +581,7 @@ struct pm_parser {
|
|
587
581
|
* The encoding functions for the current file are attached to the parser as
|
588
582
|
* it's parsing so that it can change with a magic comment.
|
589
583
|
*/
|
590
|
-
pm_encoding_t encoding;
|
584
|
+
const pm_encoding_t *encoding;
|
591
585
|
|
592
586
|
/**
|
593
587
|
* When the encoding that is being used to parse the source is changed by
|
@@ -596,14 +590,6 @@ struct pm_parser {
|
|
596
590
|
*/
|
597
591
|
pm_encoding_changed_callback_t encoding_changed_callback;
|
598
592
|
|
599
|
-
/**
|
600
|
-
* When an encoding is encountered that isn't understood by prism, we
|
601
|
-
* provide the ability here to call out to a user-defined function to get an
|
602
|
-
* encoding struct. If the function returns something that isn't NULL, we
|
603
|
-
* set that to our encoding and use it to parse identifiers.
|
604
|
-
*/
|
605
|
-
pm_encoding_decode_callback_t encoding_decode_callback;
|
606
|
-
|
607
593
|
/**
|
608
594
|
* This pointer indicates where a comment must start if it is to be
|
609
595
|
* considered an encoding comment.
|
@@ -649,7 +635,38 @@ struct pm_parser {
|
|
649
635
|
* The line number at the start of the parse. This will be used to offset
|
650
636
|
* the line numbers of all of the locations.
|
651
637
|
*/
|
652
|
-
|
638
|
+
int32_t start_line;
|
639
|
+
|
640
|
+
/**
|
641
|
+
* When a string-like expression is being lexed, any byte or escape sequence
|
642
|
+
* that resolves to a value whose top bit is set (i.e., >= 0x80) will
|
643
|
+
* explicitly set the encoding to the same encoding as the source.
|
644
|
+
* Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that
|
645
|
+
* resolves to a value whose top bit is set, then the encoding will be
|
646
|
+
* explicitly set to UTF-8.
|
647
|
+
*
|
648
|
+
* The _next_ time this happens, if the encoding that is about to become the
|
649
|
+
* explicitly set encoding does not match the previously set explicit
|
650
|
+
* encoding, a mixed encoding error will be emitted.
|
651
|
+
*
|
652
|
+
* When the expression is finished being lexed, the explicit encoding
|
653
|
+
* controls the encoding of the expression. For the most part this means
|
654
|
+
* that the expression will either be encoded in the source encoding or
|
655
|
+
* UTF-8. This holds for all encodings except US-ASCII. If the source is
|
656
|
+
* US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the
|
657
|
+
* expression will be encoded as ASCII-8BIT.
|
658
|
+
*
|
659
|
+
* Note that if the expression is a list, different elements within the same
|
660
|
+
* list can have different encodings, so this will get reset between each
|
661
|
+
* element. Furthermore all of this only applies to lists that support
|
662
|
+
* interpolation, because otherwise escapes that could change the encoding
|
663
|
+
* are ignored.
|
664
|
+
*
|
665
|
+
* At first glance, it may make more sense for this to live on the lexer
|
666
|
+
* mode, but we need it here to communicate back to the parser for character
|
667
|
+
* literals that do not push a new lexer mode.
|
668
|
+
*/
|
669
|
+
const pm_encoding_t *explicit_encoding;
|
653
670
|
|
654
671
|
/** Whether or not we're at the beginning of a command. */
|
655
672
|
bool command_start;
|
@@ -673,6 +690,9 @@ struct pm_parser {
|
|
673
690
|
/** This flag indicates that we are currently parsing a keyword argument. */
|
674
691
|
bool in_keyword_arg;
|
675
692
|
|
693
|
+
/** The constant id of the parameter whose default value is currently being parsed. */
|
694
|
+
pm_constant_id_t current_param_name;
|
695
|
+
|
676
696
|
/**
|
677
697
|
* Whether or not the parser has seen a token that has semantic meaning
|
678
698
|
* (i.e., a token that is not a comment or whitespace).
|
data/include/prism/regexp.h
CHANGED
@@ -8,7 +8,7 @@
|
|
8
8
|
|
9
9
|
#include "prism/defines.h"
|
10
10
|
#include "prism/parser.h"
|
11
|
-
#include "prism/
|
11
|
+
#include "prism/encoding.h"
|
12
12
|
#include "prism/util/pm_memchr.h"
|
13
13
|
#include "prism/util/pm_string_list.h"
|
14
14
|
#include "prism/util/pm_string.h"
|
@@ -28,6 +28,6 @@
|
|
28
28
|
* @param encoding The encoding of the source code.
|
29
29
|
* @return Whether or not the parsing was successful.
|
30
30
|
*/
|
31
|
-
PRISM_EXPORTED_FUNCTION bool pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding);
|
31
|
+
PRISM_EXPORTED_FUNCTION bool pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding);
|
32
32
|
|
33
33
|
#endif
|
@@ -118,7 +118,15 @@ void pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value);
|
|
118
118
|
* @param buffer The buffer to append to.
|
119
119
|
* @param value The integer to append.
|
120
120
|
*/
|
121
|
-
void
|
121
|
+
void pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value);
|
122
|
+
|
123
|
+
/**
|
124
|
+
* Append a 32-bit signed integer to the buffer as a variable-length integer.
|
125
|
+
*
|
126
|
+
* @param buffer The buffer to append to.
|
127
|
+
* @param value The integer to append.
|
128
|
+
*/
|
129
|
+
void pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value);
|
122
130
|
|
123
131
|
/**
|
124
132
|
* Concatenate one buffer onto another.
|
@@ -7,7 +7,7 @@
|
|
7
7
|
#define PRISM_MEMCHR_H
|
8
8
|
|
9
9
|
#include "prism/defines.h"
|
10
|
-
#include "prism/
|
10
|
+
#include "prism/encoding.h"
|
11
11
|
|
12
12
|
#include <stddef.h>
|
13
13
|
|
@@ -24,6 +24,6 @@
|
|
24
24
|
* @return A pointer to the first occurrence of the character in the source
|
25
25
|
* string, or NULL if no such character exists.
|
26
26
|
*/
|
27
|
-
void * pm_memchr(const void *source, int character, size_t number, bool encoding_changed, pm_encoding_t *encoding);
|
27
|
+
void * pm_memchr(const void *source, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding);
|
28
28
|
|
29
29
|
#endif
|
@@ -32,12 +32,12 @@
|
|
32
32
|
* need to take a slower path and iterate one multi-byte character at a time.
|
33
33
|
*
|
34
34
|
* @param parser The parser.
|
35
|
-
* @param source The source
|
35
|
+
* @param source The source to search.
|
36
36
|
* @param charset The charset to search for.
|
37
|
-
* @param length The maximum
|
37
|
+
* @param length The maximum number of bytes to search.
|
38
38
|
* @return A pointer to the first character in the source string that is in the
|
39
39
|
* charset, or NULL if no such character exists.
|
40
40
|
*/
|
41
|
-
const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length);
|
41
|
+
const uint8_t * pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length);
|
42
42
|
|
43
43
|
#endif
|
data/include/prism/version.h
CHANGED
@@ -14,7 +14,7 @@
|
|
14
14
|
/**
|
15
15
|
* The minor version of the Prism library as an int.
|
16
16
|
*/
|
17
|
-
#define PRISM_VERSION_MINOR
|
17
|
+
#define PRISM_VERSION_MINOR 19
|
18
18
|
|
19
19
|
/**
|
20
20
|
* The patch version of the Prism library as an int.
|
@@ -24,6 +24,6 @@
|
|
24
24
|
/**
|
25
25
|
* The version of the Prism library as a constant string.
|
26
26
|
*/
|
27
|
-
#define PRISM_VERSION "0.
|
27
|
+
#define PRISM_VERSION "0.19.0"
|
28
28
|
|
29
29
|
#endif
|