yarp 0.8.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +48 -1
- data/Makefile +5 -1
- data/README.md +4 -3
- data/config.yml +461 -150
- data/docs/configuration.md +1 -0
- data/docs/encoding.md +5 -5
- data/docs/ruby_api.md +2 -0
- data/docs/serialization.md +3 -3
- data/docs/testing.md +2 -2
- data/ext/yarp/api_node.c +810 -199
- data/ext/yarp/extension.c +94 -31
- data/ext/yarp/extension.h +2 -2
- data/include/yarp/ast.h +653 -150
- data/include/yarp/defines.h +2 -1
- data/include/yarp/diagnostic.h +3 -3
- data/include/yarp/enc/yp_encoding.h +10 -10
- data/include/yarp/node.h +10 -0
- data/include/yarp/parser.h +19 -19
- data/include/yarp/regexp.h +1 -1
- data/include/yarp/unescape.h +7 -5
- data/include/yarp/util/yp_buffer.h +3 -0
- data/include/yarp/util/yp_char.h +16 -16
- data/include/yarp/util/yp_constant_pool.h +2 -2
- data/include/yarp/util/yp_newline_list.h +7 -4
- data/include/yarp/util/yp_string.h +4 -4
- data/include/yarp/util/yp_string_list.h +0 -3
- data/include/yarp/util/yp_strpbrk.h +1 -1
- data/include/yarp/version.h +2 -2
- data/include/yarp.h +14 -3
- data/lib/yarp/desugar_visitor.rb +204 -0
- data/lib/yarp/ffi.rb +27 -1
- data/lib/yarp/lex_compat.rb +93 -25
- data/lib/yarp/mutation_visitor.rb +683 -0
- data/lib/yarp/node.rb +3121 -597
- data/lib/yarp/serialize.rb +198 -126
- data/lib/yarp.rb +53 -7
- data/src/diagnostic.c +1 -1
- data/src/enc/yp_big5.c +15 -42
- data/src/enc/yp_euc_jp.c +16 -43
- data/src/enc/yp_gbk.c +19 -46
- data/src/enc/yp_shift_jis.c +16 -43
- data/src/enc/yp_tables.c +36 -38
- data/src/enc/yp_unicode.c +20 -25
- data/src/enc/yp_windows_31j.c +16 -43
- data/src/node.c +1444 -836
- data/src/prettyprint.c +324 -103
- data/src/regexp.c +21 -21
- data/src/serialize.c +429 -276
- data/src/token_type.c +2 -2
- data/src/unescape.c +184 -136
- data/src/util/yp_buffer.c +7 -2
- data/src/util/yp_char.c +34 -34
- data/src/util/yp_constant_pool.c +4 -4
- data/src/util/yp_memchr.c +1 -1
- data/src/util/yp_newline_list.c +14 -3
- data/src/util/yp_string.c +22 -20
- data/src/util/yp_string_list.c +0 -6
- data/src/util/yp_strncasecmp.c +3 -6
- data/src/util/yp_strpbrk.c +8 -8
- data/src/yarp.c +1504 -615
- data/yarp.gemspec +3 -1
- metadata +4 -2
data/src/token_type.c
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
/******************************************************************************/
|
2
|
-
/* This file is generated by the
|
3
|
-
/* modified manually. See
|
2
|
+
/* This file is generated by the templates/template.rb script and should not */
|
3
|
+
/* be modified manually. See */
|
4
4
|
/* templates/src/token_type.c.erb */
|
5
5
|
/* if you are looking to modify the */
|
6
6
|
/* template */
|
data/src/unescape.c
CHANGED
@@ -5,21 +5,33 @@
|
|
5
5
|
/******************************************************************************/
|
6
6
|
|
7
7
|
static inline bool
|
8
|
-
yp_char_is_hexadecimal_digits(const
|
8
|
+
yp_char_is_hexadecimal_digits(const uint8_t *string, size_t length) {
|
9
9
|
for (size_t index = 0; index < length; index++) {
|
10
|
-
if (!yp_char_is_hexadecimal_digit(
|
10
|
+
if (!yp_char_is_hexadecimal_digit(string[index])) {
|
11
11
|
return false;
|
12
12
|
}
|
13
13
|
}
|
14
14
|
return true;
|
15
15
|
}
|
16
16
|
|
17
|
+
// We don't call the char_width function unless we have to because it's
|
18
|
+
// expensive to go through the indirection of the function pointer. Instead we
|
19
|
+
// provide a fast path that will check if we can just return 1.
|
20
|
+
static inline size_t
|
21
|
+
yp_char_width(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
22
|
+
if (parser->encoding_changed || (*start >= 0x80)) {
|
23
|
+
return parser->encoding.char_width(start, end - start);
|
24
|
+
} else {
|
25
|
+
return 1;
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
17
29
|
/******************************************************************************/
|
18
30
|
/* Lookup tables for characters */
|
19
31
|
/******************************************************************************/
|
20
32
|
|
21
33
|
// This is a lookup table for unescapes that only take up a single character.
|
22
|
-
static const
|
34
|
+
static const uint8_t unescape_chars[] = {
|
23
35
|
['\''] = '\'',
|
24
36
|
['\\'] = '\\',
|
25
37
|
['a'] = '\a',
|
@@ -46,9 +58,8 @@ static const bool ascii_printable_chars[] = {
|
|
46
58
|
};
|
47
59
|
|
48
60
|
static inline bool
|
49
|
-
char_is_ascii_printable(const
|
50
|
-
|
51
|
-
return (v < 0x80) && ascii_printable_chars[v];
|
61
|
+
char_is_ascii_printable(const uint8_t b) {
|
62
|
+
return (b < 0x80) && ascii_printable_chars[b];
|
52
63
|
}
|
53
64
|
|
54
65
|
/******************************************************************************/
|
@@ -58,37 +69,39 @@ char_is_ascii_printable(const char c) {
|
|
58
69
|
// Scan the 1-3 digits of octal into the value. Returns the number of digits
|
59
70
|
// scanned.
|
60
71
|
static inline size_t
|
61
|
-
unescape_octal(const
|
62
|
-
*value = (
|
63
|
-
if (!yp_char_is_octal_digit(backslash[2])) {
|
72
|
+
unescape_octal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
|
73
|
+
*value = (uint8_t) (backslash[1] - '0');
|
74
|
+
if (backslash + 2 >= end || !yp_char_is_octal_digit(backslash[2])) {
|
64
75
|
return 2;
|
65
76
|
}
|
66
|
-
|
67
|
-
|
68
|
-
if (!yp_char_is_octal_digit(backslash[3])) {
|
77
|
+
*value = (uint8_t) ((*value << 3) | (backslash[2] - '0'));
|
78
|
+
if (backslash + 3 >= end || !yp_char_is_octal_digit(backslash[3])) {
|
69
79
|
return 3;
|
70
80
|
}
|
71
|
-
|
72
|
-
*value = (unsigned char) ((*value << 3) | (backslash[3] - '0'));
|
81
|
+
*value = (uint8_t) ((*value << 3) | (backslash[3] - '0'));
|
73
82
|
return 4;
|
74
83
|
}
|
75
84
|
|
76
85
|
// Convert a hexadecimal digit into its equivalent value.
|
77
|
-
static inline
|
78
|
-
unescape_hexadecimal_digit(const
|
79
|
-
return (
|
86
|
+
static inline uint8_t
|
87
|
+
unescape_hexadecimal_digit(const uint8_t value) {
|
88
|
+
return (uint8_t) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
|
80
89
|
}
|
81
90
|
|
82
91
|
// Scan the 1-2 digits of hexadecimal into the value. Returns the number of
|
83
92
|
// digits scanned.
|
84
93
|
static inline size_t
|
85
|
-
unescape_hexadecimal(const
|
94
|
+
unescape_hexadecimal(const uint8_t *backslash, uint8_t *value, const uint8_t *end, yp_list_t *error_list) {
|
95
|
+
*value = 0;
|
96
|
+
if (backslash + 2 >= end || !yp_char_is_hexadecimal_digit(backslash[2])) {
|
97
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid hex escape.");
|
98
|
+
return 2;
|
99
|
+
}
|
86
100
|
*value = unescape_hexadecimal_digit(backslash[2]);
|
87
|
-
if (!yp_char_is_hexadecimal_digit(backslash[3])) {
|
101
|
+
if (backslash + 3 >= end || !yp_char_is_hexadecimal_digit(backslash[3])) {
|
88
102
|
return 3;
|
89
103
|
}
|
90
|
-
|
91
|
-
*value = (unsigned char) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
|
104
|
+
*value = (uint8_t) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
|
92
105
|
return 4;
|
93
106
|
}
|
94
107
|
|
@@ -96,7 +109,7 @@ unescape_hexadecimal(const char *backslash, unsigned char *value) {
|
|
96
109
|
// digits scanned. This function assumes that the characters have already been
|
97
110
|
// validated.
|
98
111
|
static inline void
|
99
|
-
unescape_unicode(const
|
112
|
+
unescape_unicode(const uint8_t *string, size_t length, uint32_t *value) {
|
100
113
|
*value = 0;
|
101
114
|
for (size_t index = 0; index < length; index++) {
|
102
115
|
if (index != 0) *value <<= 4;
|
@@ -108,27 +121,25 @@ unescape_unicode(const char *string, size_t length, uint32_t *value) {
|
|
108
121
|
// 32-bit value to write. Writes the UTF-8 representation of the value to the
|
109
122
|
// string and returns the number of bytes written.
|
110
123
|
static inline size_t
|
111
|
-
unescape_unicode_write(
|
112
|
-
unsigned char *bytes = (unsigned char *) dest;
|
113
|
-
|
124
|
+
unescape_unicode_write(uint8_t *dest, uint32_t value, const uint8_t *start, const uint8_t *end, yp_list_t *error_list) {
|
114
125
|
if (value <= 0x7F) {
|
115
126
|
// 0xxxxxxx
|
116
|
-
|
127
|
+
dest[0] = (uint8_t) value;
|
117
128
|
return 1;
|
118
129
|
}
|
119
130
|
|
120
131
|
if (value <= 0x7FF) {
|
121
132
|
// 110xxxxx 10xxxxxx
|
122
|
-
|
123
|
-
|
133
|
+
dest[0] = (uint8_t) (0xC0 | (value >> 6));
|
134
|
+
dest[1] = (uint8_t) (0x80 | (value & 0x3F));
|
124
135
|
return 2;
|
125
136
|
}
|
126
137
|
|
127
138
|
if (value <= 0xFFFF) {
|
128
139
|
// 1110xxxx 10xxxxxx 10xxxxxx
|
129
|
-
|
130
|
-
|
131
|
-
|
140
|
+
dest[0] = (uint8_t) (0xE0 | (value >> 12));
|
141
|
+
dest[1] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
|
142
|
+
dest[2] = (uint8_t) (0x80 | (value & 0x3F));
|
132
143
|
return 3;
|
133
144
|
}
|
134
145
|
|
@@ -136,20 +147,20 @@ unescape_unicode_write(char *dest, uint32_t value, const char *start, const char
|
|
136
147
|
// the input is invalid.
|
137
148
|
if (value <= 0x10FFFF) {
|
138
149
|
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
150
|
+
dest[0] = (uint8_t) (0xF0 | (value >> 18));
|
151
|
+
dest[1] = (uint8_t) (0x80 | ((value >> 12) & 0x3F));
|
152
|
+
dest[2] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
|
153
|
+
dest[3] = (uint8_t) (0x80 | (value & 0x3F));
|
143
154
|
return 4;
|
144
155
|
}
|
145
156
|
|
146
157
|
// If we get here, then the value is too big. This is an error, but we don't
|
147
158
|
// want to just crash, so instead we'll add an error to the error list and put
|
148
159
|
// in a replacement character instead.
|
149
|
-
yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
|
150
|
-
|
151
|
-
|
152
|
-
|
160
|
+
if (error_list) yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
|
161
|
+
dest[0] = 0xEF;
|
162
|
+
dest[1] = 0xBF;
|
163
|
+
dest[2] = 0xBD;
|
153
164
|
return 3;
|
154
165
|
}
|
155
166
|
|
@@ -161,24 +172,30 @@ typedef enum {
|
|
161
172
|
} yp_unescape_flag_t;
|
162
173
|
|
163
174
|
// Unescape a single character value based on the given flags.
|
164
|
-
static inline
|
165
|
-
unescape_char(
|
166
|
-
unsigned char unescaped = value;
|
167
|
-
|
175
|
+
static inline uint8_t
|
176
|
+
unescape_char(uint8_t value, const uint8_t flags) {
|
168
177
|
if (flags & YP_UNESCAPE_FLAG_CONTROL) {
|
169
|
-
|
178
|
+
value &= 0x1f;
|
170
179
|
}
|
171
180
|
|
172
181
|
if (flags & YP_UNESCAPE_FLAG_META) {
|
173
|
-
|
182
|
+
value |= 0x80;
|
174
183
|
}
|
175
184
|
|
176
|
-
return
|
185
|
+
return value;
|
177
186
|
}
|
178
187
|
|
179
188
|
// Read a specific escape sequence into the given destination.
|
180
|
-
static const
|
181
|
-
unescape(
|
189
|
+
static const uint8_t *
|
190
|
+
unescape(
|
191
|
+
yp_parser_t *parser,
|
192
|
+
uint8_t *dest,
|
193
|
+
size_t *dest_length,
|
194
|
+
const uint8_t *backslash,
|
195
|
+
const uint8_t *end,
|
196
|
+
const uint8_t flags,
|
197
|
+
yp_list_t *error_list
|
198
|
+
) {
|
182
199
|
switch (backslash[1]) {
|
183
200
|
case 'a':
|
184
201
|
case 'b':
|
@@ -189,28 +206,28 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
|
|
189
206
|
case 's':
|
190
207
|
case 't':
|
191
208
|
case 'v':
|
192
|
-
if (
|
193
|
-
dest[(*dest_length)++] =
|
209
|
+
if (dest) {
|
210
|
+
dest[(*dest_length)++] = unescape_char(unescape_chars[backslash[1]], flags);
|
194
211
|
}
|
195
212
|
return backslash + 2;
|
196
213
|
// \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
|
197
214
|
case '0': case '1': case '2': case '3': case '4':
|
198
215
|
case '5': case '6': case '7': case '8': case '9': {
|
199
|
-
|
200
|
-
const
|
216
|
+
uint8_t value;
|
217
|
+
const uint8_t *cursor = backslash + unescape_octal(backslash, &value, end);
|
201
218
|
|
202
|
-
if (
|
203
|
-
dest[(*dest_length)++] =
|
219
|
+
if (dest) {
|
220
|
+
dest[(*dest_length)++] = unescape_char(value, flags);
|
204
221
|
}
|
205
222
|
return cursor;
|
206
223
|
}
|
207
224
|
// \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
|
208
225
|
case 'x': {
|
209
|
-
|
210
|
-
const
|
226
|
+
uint8_t value;
|
227
|
+
const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value, end, error_list);
|
211
228
|
|
212
|
-
if (
|
213
|
-
dest[(*dest_length)++] =
|
229
|
+
if (dest) {
|
230
|
+
dest[(*dest_length)++] = unescape_char(value, flags);
|
214
231
|
}
|
215
232
|
return cursor;
|
216
233
|
}
|
@@ -218,28 +235,28 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
|
|
218
235
|
// \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
|
219
236
|
case 'u': {
|
220
237
|
if ((flags & YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) {
|
221
|
-
yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
|
238
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
|
222
239
|
return backslash + 2;
|
223
240
|
}
|
224
241
|
|
225
242
|
if ((backslash + 3) < end && backslash[2] == '{') {
|
226
|
-
const
|
227
|
-
const
|
243
|
+
const uint8_t *unicode_cursor = backslash + 3;
|
244
|
+
const uint8_t *extra_codepoints_start = NULL;
|
228
245
|
int codepoints_count = 0;
|
229
246
|
|
230
247
|
unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
|
231
248
|
|
232
|
-
while ((
|
233
|
-
const
|
249
|
+
while ((unicode_cursor < end) && (*unicode_cursor != '}')) {
|
250
|
+
const uint8_t *unicode_start = unicode_cursor;
|
234
251
|
size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
|
235
252
|
|
236
253
|
// \u{nnnn} character literal allows only 1-6 hexadecimal digits
|
237
|
-
if (hexadecimal_length > 6)
|
238
|
-
yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
|
239
|
-
|
254
|
+
if (hexadecimal_length > 6) {
|
255
|
+
if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
|
256
|
+
}
|
240
257
|
// there are not hexadecimal characters
|
241
|
-
if (hexadecimal_length == 0) {
|
242
|
-
yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
|
258
|
+
else if (hexadecimal_length == 0) {
|
259
|
+
if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
|
243
260
|
return unicode_cursor;
|
244
261
|
}
|
245
262
|
|
@@ -251,7 +268,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
|
|
251
268
|
|
252
269
|
uint32_t value;
|
253
270
|
unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
|
254
|
-
if (
|
271
|
+
if (dest) {
|
255
272
|
*dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
|
256
273
|
}
|
257
274
|
|
@@ -259,23 +276,29 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
|
|
259
276
|
}
|
260
277
|
|
261
278
|
// ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
|
262
|
-
if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
|
263
|
-
yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
|
279
|
+
if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1) {
|
280
|
+
if (error_list) yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
|
281
|
+
}
|
264
282
|
|
265
|
-
|
266
|
-
|
283
|
+
if (unicode_cursor < end && *unicode_cursor == '}') {
|
284
|
+
unicode_cursor++;
|
285
|
+
} else {
|
286
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, unicode_cursor, "invalid Unicode escape.");
|
287
|
+
}
|
267
288
|
|
268
|
-
|
289
|
+
return unicode_cursor;
|
290
|
+
}
|
291
|
+
else if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
|
269
292
|
uint32_t value;
|
270
293
|
unescape_unicode(backslash + 2, 4, &value);
|
271
294
|
|
272
|
-
if (
|
295
|
+
if (dest) {
|
273
296
|
*dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
|
274
297
|
}
|
275
298
|
return backslash + 6;
|
276
299
|
}
|
277
300
|
|
278
|
-
yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
|
301
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
|
279
302
|
return backslash + 2;
|
280
303
|
}
|
281
304
|
// \c\M-x meta control character, where x is an ASCII printable character
|
@@ -283,31 +306,31 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
|
|
283
306
|
// \cx control character, where x is an ASCII printable character
|
284
307
|
case 'c':
|
285
308
|
if (backslash + 2 >= end) {
|
286
|
-
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
309
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
287
310
|
return end;
|
288
311
|
}
|
289
312
|
|
290
313
|
if (flags & YP_UNESCAPE_FLAG_CONTROL) {
|
291
|
-
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
|
314
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
|
292
315
|
return backslash + 2;
|
293
316
|
}
|
294
317
|
|
295
318
|
switch (backslash[2]) {
|
296
319
|
case '\\':
|
297
|
-
return unescape(dest, dest_length, backslash + 2, end,
|
320
|
+
return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
|
298
321
|
case '?':
|
299
|
-
if (
|
300
|
-
dest[(*dest_length)++] =
|
322
|
+
if (dest) {
|
323
|
+
dest[(*dest_length)++] = unescape_char(0x7f, flags);
|
301
324
|
}
|
302
325
|
return backslash + 3;
|
303
326
|
default: {
|
304
327
|
if (!char_is_ascii_printable(backslash[2])) {
|
305
|
-
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
328
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
306
329
|
return backslash + 2;
|
307
330
|
}
|
308
331
|
|
309
|
-
if (
|
310
|
-
dest[(*dest_length)++] =
|
332
|
+
if (dest) {
|
333
|
+
dest[(*dest_length)++] = unescape_char(backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
|
311
334
|
}
|
312
335
|
return backslash + 3;
|
313
336
|
}
|
@@ -316,36 +339,36 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
|
|
316
339
|
// \C-? delete, ASCII 7Fh (DEL)
|
317
340
|
case 'C':
|
318
341
|
if (backslash + 3 >= end) {
|
319
|
-
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
342
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
320
343
|
return end;
|
321
344
|
}
|
322
345
|
|
323
346
|
if (flags & YP_UNESCAPE_FLAG_CONTROL) {
|
324
|
-
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
|
347
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
|
325
348
|
return backslash + 2;
|
326
349
|
}
|
327
350
|
|
328
351
|
if (backslash[2] != '-') {
|
329
|
-
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
352
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
330
353
|
return backslash + 2;
|
331
354
|
}
|
332
355
|
|
333
356
|
switch (backslash[3]) {
|
334
357
|
case '\\':
|
335
|
-
return unescape(dest, dest_length, backslash + 3, end,
|
358
|
+
return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
|
336
359
|
case '?':
|
337
|
-
if (
|
338
|
-
dest[(*dest_length)++] =
|
360
|
+
if (dest) {
|
361
|
+
dest[(*dest_length)++] = unescape_char(0x7f, flags);
|
339
362
|
}
|
340
363
|
return backslash + 4;
|
341
364
|
default:
|
342
365
|
if (!char_is_ascii_printable(backslash[3])) {
|
343
|
-
yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
|
366
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
|
344
367
|
return backslash + 2;
|
345
368
|
}
|
346
369
|
|
347
|
-
if (
|
348
|
-
dest[(*dest_length)++] =
|
370
|
+
if (dest) {
|
371
|
+
dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
|
349
372
|
}
|
350
373
|
return backslash + 4;
|
351
374
|
}
|
@@ -354,32 +377,32 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
|
|
354
377
|
// \M-x meta character, where x is an ASCII printable character
|
355
378
|
case 'M': {
|
356
379
|
if (backslash + 3 >= end) {
|
357
|
-
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
380
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
358
381
|
return end;
|
359
382
|
}
|
360
383
|
|
361
384
|
if (flags & YP_UNESCAPE_FLAG_META) {
|
362
|
-
yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
|
385
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
|
363
386
|
return backslash + 2;
|
364
387
|
}
|
365
388
|
|
366
389
|
if (backslash[2] != '-') {
|
367
|
-
yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
|
390
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
|
368
391
|
return backslash + 2;
|
369
392
|
}
|
370
393
|
|
371
394
|
if (backslash[3] == '\\') {
|
372
|
-
return unescape(dest, dest_length, backslash + 3, end,
|
395
|
+
return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, error_list);
|
373
396
|
}
|
374
397
|
|
375
398
|
if (char_is_ascii_printable(backslash[3])) {
|
376
|
-
if (
|
377
|
-
dest[(*dest_length)++] =
|
399
|
+
if (dest) {
|
400
|
+
dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_META);
|
378
401
|
}
|
379
402
|
return backslash + 4;
|
380
403
|
}
|
381
404
|
|
382
|
-
yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
|
405
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
|
383
406
|
return backslash + 3;
|
384
407
|
}
|
385
408
|
// \n
|
@@ -390,14 +413,17 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
|
|
390
413
|
if (backslash + 2 < end && backslash[2] == '\n') {
|
391
414
|
return backslash + 3;
|
392
415
|
}
|
393
|
-
|
394
|
-
/* fallthrough */
|
416
|
+
/* fallthrough */
|
395
417
|
// In this case we're escaping something that doesn't need escaping.
|
396
418
|
default: {
|
397
|
-
|
398
|
-
|
419
|
+
size_t width = yp_char_width(parser, backslash + 1, end);
|
420
|
+
|
421
|
+
if (dest) {
|
422
|
+
memcpy(dest + *dest_length, backslash + 1, width);
|
423
|
+
*dest_length += width;
|
399
424
|
}
|
400
|
-
|
425
|
+
|
426
|
+
return backslash + 1 + width;
|
401
427
|
}
|
402
428
|
}
|
403
429
|
}
|
@@ -430,14 +456,14 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
|
|
430
456
|
// \c\M-x same as above
|
431
457
|
// \c? or \C-? delete, ASCII 7Fh (DEL)
|
432
458
|
//
|
433
|
-
|
434
|
-
|
459
|
+
static void
|
460
|
+
yp_unescape_manipulate_string_or_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
|
435
461
|
if (unescape_type == YP_UNESCAPE_NONE) {
|
436
462
|
// If we're not unescaping then we can reference the source directly.
|
437
463
|
return;
|
438
464
|
}
|
439
465
|
|
440
|
-
const
|
466
|
+
const uint8_t *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);
|
441
467
|
|
442
468
|
if (backslash == NULL) {
|
443
469
|
// Here there are no escapes, so we can reference the source directly.
|
@@ -446,21 +472,21 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
|
|
446
472
|
|
447
473
|
// Here we have found an escape character, so we need to handle all escapes
|
448
474
|
// within the string.
|
449
|
-
|
475
|
+
uint8_t *allocated = malloc(string->length);
|
450
476
|
if (allocated == NULL) {
|
451
|
-
yp_diagnostic_list_append(error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
|
477
|
+
yp_diagnostic_list_append(&parser->error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
|
452
478
|
return;
|
453
479
|
}
|
454
480
|
|
455
481
|
// This is the memory address where we're putting the unescaped string.
|
456
|
-
|
482
|
+
uint8_t *dest = allocated;
|
457
483
|
size_t dest_length = 0;
|
458
484
|
|
459
485
|
// This is the current position in the source string that we're looking at.
|
460
486
|
// It's going to move along behind the backslash so that we can copy each
|
461
487
|
// segment of the string that doesn't contain an escape.
|
462
|
-
const
|
463
|
-
const
|
488
|
+
const uint8_t *cursor = string->source;
|
489
|
+
const uint8_t *end = string->source + string->length;
|
464
490
|
|
465
491
|
// For each escape found in the source string, we will handle it and update
|
466
492
|
// the moving cursor->backslash window.
|
@@ -479,7 +505,7 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
|
|
479
505
|
switch (backslash[1]) {
|
480
506
|
case '\\':
|
481
507
|
case '\'':
|
482
|
-
dest[dest_length++] =
|
508
|
+
dest[dest_length++] = unescape_chars[backslash[1]];
|
483
509
|
cursor = backslash + 2;
|
484
510
|
break;
|
485
511
|
default:
|
@@ -493,7 +519,13 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
|
|
493
519
|
// This is the only type of unescaping left. In this case we need to
|
494
520
|
// handle all of the different unescapes.
|
495
521
|
assert(unescape_type == YP_UNESCAPE_ALL);
|
496
|
-
|
522
|
+
|
523
|
+
uint8_t flags = YP_UNESCAPE_FLAG_NONE;
|
524
|
+
if (expect_single_codepoint) {
|
525
|
+
flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
|
526
|
+
}
|
527
|
+
|
528
|
+
cursor = unescape(parser, dest, &dest_length, backslash, end, flags, &parser->error_list);
|
497
529
|
break;
|
498
530
|
}
|
499
531
|
|
@@ -521,50 +553,66 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
|
|
521
553
|
yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor)));
|
522
554
|
}
|
523
555
|
|
524
|
-
YP_EXPORTED_FUNCTION
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
yp_parser_t parser;
|
529
|
-
yp_parser_init(&parser, start, length, "");
|
530
|
-
|
531
|
-
yp_list_t error_list = YP_LIST_EMPTY;
|
532
|
-
yp_string_shared_init(result, start, start + length);
|
533
|
-
yp_unescape_manipulate_string(&parser, result, unescape_type, &error_list);
|
534
|
-
success = yp_list_empty_p(&error_list);
|
535
|
-
|
536
|
-
yp_list_free(&error_list);
|
537
|
-
yp_parser_free(&parser);
|
556
|
+
YP_EXPORTED_FUNCTION void
|
557
|
+
yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
|
558
|
+
yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, false);
|
559
|
+
}
|
538
560
|
|
539
|
-
|
561
|
+
void
|
562
|
+
yp_unescape_manipulate_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
|
563
|
+
yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, true);
|
540
564
|
}
|
541
565
|
|
542
566
|
// This function is similar to yp_unescape_manipulate_string, except it doesn't
|
543
567
|
// actually perform any string manipulations. Instead, it calculates how long
|
544
568
|
// the unescaped character is, and returns that value
|
545
|
-
|
546
|
-
yp_unescape_calculate_difference(
|
569
|
+
size_t
|
570
|
+
yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
|
547
571
|
assert(unescape_type != YP_UNESCAPE_NONE);
|
548
572
|
|
573
|
+
if (backslash + 1 >= parser->end) {
|
574
|
+
return 0;
|
575
|
+
}
|
576
|
+
|
549
577
|
switch (backslash[1]) {
|
550
578
|
case '\\':
|
551
579
|
case '\'':
|
552
580
|
return 2;
|
553
581
|
default: {
|
554
|
-
if (unescape_type == YP_UNESCAPE_MINIMAL)
|
582
|
+
if (unescape_type == YP_UNESCAPE_MINIMAL) {
|
583
|
+
return 1 + yp_char_width(parser, backslash + 1, parser->end);
|
584
|
+
}
|
555
585
|
|
556
586
|
// This is the only type of unescaping left. In this case we need to
|
557
587
|
// handle all of the different unescapes.
|
558
588
|
assert(unescape_type == YP_UNESCAPE_ALL);
|
559
589
|
|
560
|
-
|
561
|
-
if (expect_single_codepoint)
|
590
|
+
uint8_t flags = YP_UNESCAPE_FLAG_NONE;
|
591
|
+
if (expect_single_codepoint) {
|
562
592
|
flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
|
593
|
+
}
|
563
594
|
|
564
|
-
const
|
595
|
+
const uint8_t *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, NULL);
|
565
596
|
assert(cursor > backslash);
|
566
597
|
|
567
598
|
return (size_t) (cursor - backslash);
|
568
599
|
}
|
569
600
|
}
|
570
601
|
}
|
602
|
+
|
603
|
+
// This is one of the main entry points into the extension. It accepts a source
|
604
|
+
// string, a type of unescaping, and a pointer to a result string. It returns a
|
605
|
+
// boolean indicating whether or not the unescaping was successful.
|
606
|
+
YP_EXPORTED_FUNCTION bool
|
607
|
+
yp_unescape_string(const uint8_t *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
|
608
|
+
yp_parser_t parser;
|
609
|
+
yp_parser_init(&parser, start, length, NULL);
|
610
|
+
|
611
|
+
yp_string_shared_init(result, start, start + length);
|
612
|
+
yp_unescape_manipulate_string(&parser, result, unescape_type);
|
613
|
+
|
614
|
+
bool success = yp_list_empty_p(&parser.error_list);
|
615
|
+
yp_parser_free(&parser);
|
616
|
+
|
617
|
+
return success;
|
618
|
+
}
|