yarp 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -1
- data/Makefile +5 -1
- data/config.yml +156 -125
- data/docs/encoding.md +5 -5
- data/docs/serialization.md +2 -2
- data/ext/yarp/api_node.c +142 -98
- data/ext/yarp/extension.c +21 -7
- data/ext/yarp/extension.h +1 -1
- data/include/yarp/ast.h +327 -18
- data/include/yarp/defines.h +2 -1
- data/include/yarp/diagnostic.h +3 -3
- data/include/yarp/enc/yp_encoding.h +10 -10
- data/include/yarp/parser.h +19 -19
- data/include/yarp/regexp.h +1 -1
- data/include/yarp/unescape.h +4 -4
- data/include/yarp/util/yp_buffer.h +3 -0
- data/include/yarp/util/yp_char.h +16 -16
- data/include/yarp/util/yp_constant_pool.h +2 -2
- data/include/yarp/util/yp_newline_list.h +5 -5
- data/include/yarp/util/yp_string.h +4 -4
- data/include/yarp/util/yp_string_list.h +0 -3
- data/include/yarp/util/yp_strpbrk.h +1 -1
- data/include/yarp/version.h +2 -2
- data/include/yarp.h +5 -4
- data/lib/yarp/desugar_visitor.rb +59 -122
- data/lib/yarp/node.rb +230 -240
- data/lib/yarp/serialize.rb +16 -16
- data/lib/yarp.rb +5 -5
- data/src/diagnostic.c +1 -1
- data/src/enc/yp_big5.c +15 -42
- data/src/enc/yp_euc_jp.c +16 -43
- data/src/enc/yp_gbk.c +19 -46
- data/src/enc/yp_shift_jis.c +16 -43
- data/src/enc/yp_tables.c +36 -38
- data/src/enc/yp_unicode.c +20 -25
- data/src/enc/yp_windows_31j.c +16 -43
- data/src/node.c +1271 -899
- data/src/prettyprint.c +87 -48
- data/src/regexp.c +21 -21
- data/src/serialize.c +28 -15
- data/src/unescape.c +151 -121
- data/src/util/yp_buffer.c +7 -2
- data/src/util/yp_char.c +34 -34
- data/src/util/yp_constant_pool.c +4 -4
- data/src/util/yp_memchr.c +1 -1
- data/src/util/yp_newline_list.c +5 -4
- data/src/util/yp_string.c +22 -20
- data/src/util/yp_string_list.c +0 -6
- data/src/util/yp_strncasecmp.c +3 -6
- data/src/util/yp_strpbrk.c +8 -8
- data/src/yarp.c +355 -216
- data/yarp.gemspec +1 -1
- metadata +2 -2
data/src/unescape.c
CHANGED
@@ -5,9 +5,9 @@
|
|
5
5
|
/******************************************************************************/
|
6
6
|
|
7
7
|
static inline bool
|
8
|
-
yp_char_is_hexadecimal_digits(const
|
8
|
+
yp_char_is_hexadecimal_digits(const uint8_t *string, size_t length) {
|
9
9
|
for (size_t index = 0; index < length; index++) {
|
10
|
-
if (!yp_char_is_hexadecimal_digit(
|
10
|
+
if (!yp_char_is_hexadecimal_digit(string[index])) {
|
11
11
|
return false;
|
12
12
|
}
|
13
13
|
}
|
@@ -18,10 +18,8 @@ yp_char_is_hexadecimal_digits(const char *c, size_t length) {
|
|
18
18
|
// expensive to go through the indirection of the function pointer. Instead we
|
19
19
|
// provide a fast path that will check if we can just return 1.
|
20
20
|
static inline size_t
|
21
|
-
yp_char_width(yp_parser_t *parser, const
|
22
|
-
|
23
|
-
|
24
|
-
if (parser->encoding_changed || (*uc >= 0x80)) {
|
21
|
+
yp_char_width(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
22
|
+
if (parser->encoding_changed || (*start >= 0x80)) {
|
25
23
|
return parser->encoding.char_width(start, end - start);
|
26
24
|
} else {
|
27
25
|
return 1;
|
@@ -33,7 +31,7 @@ yp_char_width(yp_parser_t *parser, const char *start, const char *end) {
|
|
33
31
|
/******************************************************************************/
|
34
32
|
|
35
33
|
// This is a lookup table for unescapes that only take up a single character.
|
36
|
-
static const
|
34
|
+
static const uint8_t unescape_chars[] = {
|
37
35
|
['\''] = '\'',
|
38
36
|
['\\'] = '\\',
|
39
37
|
['a'] = '\a',
|
@@ -60,9 +58,8 @@ static const bool ascii_printable_chars[] = {
|
|
60
58
|
};
|
61
59
|
|
62
60
|
static inline bool
|
63
|
-
char_is_ascii_printable(const
|
64
|
-
|
65
|
-
return (v < 0x80) && ascii_printable_chars[v];
|
61
|
+
char_is_ascii_printable(const uint8_t b) {
|
62
|
+
return (b < 0x80) && ascii_printable_chars[b];
|
66
63
|
}
|
67
64
|
|
68
65
|
/******************************************************************************/
|
@@ -72,37 +69,39 @@ char_is_ascii_printable(const char c) {
|
|
72
69
|
// Scan the 1-3 digits of octal into the value. Returns the number of digits
|
73
70
|
// scanned.
|
74
71
|
static inline size_t
|
75
|
-
unescape_octal(const
|
76
|
-
*value = (
|
77
|
-
if (!yp_char_is_octal_digit(backslash[2])) {
|
72
|
+
unescape_octal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
|
73
|
+
*value = (uint8_t) (backslash[1] - '0');
|
74
|
+
if (backslash + 2 >= end || !yp_char_is_octal_digit(backslash[2])) {
|
78
75
|
return 2;
|
79
76
|
}
|
80
|
-
|
81
|
-
|
82
|
-
if (!yp_char_is_octal_digit(backslash[3])) {
|
77
|
+
*value = (uint8_t) ((*value << 3) | (backslash[2] - '0'));
|
78
|
+
if (backslash + 3 >= end || !yp_char_is_octal_digit(backslash[3])) {
|
83
79
|
return 3;
|
84
80
|
}
|
85
|
-
|
86
|
-
*value = (unsigned char) ((*value << 3) | (backslash[3] - '0'));
|
81
|
+
*value = (uint8_t) ((*value << 3) | (backslash[3] - '0'));
|
87
82
|
return 4;
|
88
83
|
}
|
89
84
|
|
90
85
|
// Convert a hexadecimal digit into its equivalent value.
|
91
|
-
static inline
|
92
|
-
unescape_hexadecimal_digit(const
|
93
|
-
return (
|
86
|
+
static inline uint8_t
|
87
|
+
unescape_hexadecimal_digit(const uint8_t value) {
|
88
|
+
return (uint8_t) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
|
94
89
|
}
|
95
90
|
|
96
91
|
// Scan the 1-2 digits of hexadecimal into the value. Returns the number of
|
97
92
|
// digits scanned.
|
98
93
|
static inline size_t
|
99
|
-
unescape_hexadecimal(const
|
94
|
+
unescape_hexadecimal(const uint8_t *backslash, uint8_t *value, const uint8_t *end, yp_list_t *error_list) {
|
95
|
+
*value = 0;
|
96
|
+
if (backslash + 2 >= end || !yp_char_is_hexadecimal_digit(backslash[2])) {
|
97
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid hex escape.");
|
98
|
+
return 2;
|
99
|
+
}
|
100
100
|
*value = unescape_hexadecimal_digit(backslash[2]);
|
101
|
-
if (!yp_char_is_hexadecimal_digit(backslash[3])) {
|
101
|
+
if (backslash + 3 >= end || !yp_char_is_hexadecimal_digit(backslash[3])) {
|
102
102
|
return 3;
|
103
103
|
}
|
104
|
-
|
105
|
-
*value = (unsigned char) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
|
104
|
+
*value = (uint8_t) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
|
106
105
|
return 4;
|
107
106
|
}
|
108
107
|
|
@@ -110,7 +109,7 @@ unescape_hexadecimal(const char *backslash, unsigned char *value) {
|
|
110
109
|
// digits scanned. This function assumes that the characters have already been
|
111
110
|
// validated.
|
112
111
|
static inline void
|
113
|
-
unescape_unicode(const
|
112
|
+
unescape_unicode(const uint8_t *string, size_t length, uint32_t *value) {
|
114
113
|
*value = 0;
|
115
114
|
for (size_t index = 0; index < length; index++) {
|
116
115
|
if (index != 0) *value <<= 4;
|
@@ -122,27 +121,25 @@ unescape_unicode(const char *string, size_t length, uint32_t *value) {
|
|
122
121
|
// 32-bit value to write. Writes the UTF-8 representation of the value to the
|
123
122
|
// string and returns the number of bytes written.
|
124
123
|
static inline size_t
|
125
|
-
unescape_unicode_write(
|
126
|
-
unsigned char *bytes = (unsigned char *) dest;
|
127
|
-
|
124
|
+
unescape_unicode_write(uint8_t *dest, uint32_t value, const uint8_t *start, const uint8_t *end, yp_list_t *error_list) {
|
128
125
|
if (value <= 0x7F) {
|
129
126
|
// 0xxxxxxx
|
130
|
-
|
127
|
+
dest[0] = (uint8_t) value;
|
131
128
|
return 1;
|
132
129
|
}
|
133
130
|
|
134
131
|
if (value <= 0x7FF) {
|
135
132
|
// 110xxxxx 10xxxxxx
|
136
|
-
|
137
|
-
|
133
|
+
dest[0] = (uint8_t) (0xC0 | (value >> 6));
|
134
|
+
dest[1] = (uint8_t) (0x80 | (value & 0x3F));
|
138
135
|
return 2;
|
139
136
|
}
|
140
137
|
|
141
138
|
if (value <= 0xFFFF) {
|
142
139
|
// 1110xxxx 10xxxxxx 10xxxxxx
|
143
|
-
|
144
|
-
|
145
|
-
|
140
|
+
dest[0] = (uint8_t) (0xE0 | (value >> 12));
|
141
|
+
dest[1] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
|
142
|
+
dest[2] = (uint8_t) (0x80 | (value & 0x3F));
|
146
143
|
return 3;
|
147
144
|
}
|
148
145
|
|
@@ -150,20 +147,20 @@ unescape_unicode_write(char *dest, uint32_t value, const char *start, const char
|
|
150
147
|
// the input is invalid.
|
151
148
|
if (value <= 0x10FFFF) {
|
152
149
|
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
150
|
+
dest[0] = (uint8_t) (0xF0 | (value >> 18));
|
151
|
+
dest[1] = (uint8_t) (0x80 | ((value >> 12) & 0x3F));
|
152
|
+
dest[2] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
|
153
|
+
dest[3] = (uint8_t) (0x80 | (value & 0x3F));
|
157
154
|
return 4;
|
158
155
|
}
|
159
156
|
|
160
157
|
// If we get here, then the value is too big. This is an error, but we don't
|
161
158
|
// want to just crash, so instead we'll add an error to the error list and put
|
162
159
|
// in a replacement character instead.
|
163
|
-
yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
|
164
|
-
|
165
|
-
|
166
|
-
|
160
|
+
if (error_list) yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
|
161
|
+
dest[0] = 0xEF;
|
162
|
+
dest[1] = 0xBF;
|
163
|
+
dest[2] = 0xBD;
|
167
164
|
return 3;
|
168
165
|
}
|
169
166
|
|
@@ -175,24 +172,30 @@ typedef enum {
|
|
175
172
|
} yp_unescape_flag_t;
|
176
173
|
|
177
174
|
// Unescape a single character value based on the given flags.
|
178
|
-
static inline
|
179
|
-
unescape_char(
|
180
|
-
unsigned char unescaped = value;
|
181
|
-
|
175
|
+
static inline uint8_t
|
176
|
+
unescape_char(uint8_t value, const uint8_t flags) {
|
182
177
|
if (flags & YP_UNESCAPE_FLAG_CONTROL) {
|
183
|
-
|
178
|
+
value &= 0x1f;
|
184
179
|
}
|
185
180
|
|
186
181
|
if (flags & YP_UNESCAPE_FLAG_META) {
|
187
|
-
|
182
|
+
value |= 0x80;
|
188
183
|
}
|
189
184
|
|
190
|
-
return
|
185
|
+
return value;
|
191
186
|
}
|
192
187
|
|
193
188
|
// Read a specific escape sequence into the given destination.
|
194
|
-
static const
|
195
|
-
unescape(
|
189
|
+
static const uint8_t *
|
190
|
+
unescape(
|
191
|
+
yp_parser_t *parser,
|
192
|
+
uint8_t *dest,
|
193
|
+
size_t *dest_length,
|
194
|
+
const uint8_t *backslash,
|
195
|
+
const uint8_t *end,
|
196
|
+
const uint8_t flags,
|
197
|
+
yp_list_t *error_list
|
198
|
+
) {
|
196
199
|
switch (backslash[1]) {
|
197
200
|
case 'a':
|
198
201
|
case 'b':
|
@@ -203,28 +206,28 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
|
203
206
|
case 's':
|
204
207
|
case 't':
|
205
208
|
case 'v':
|
206
|
-
if (
|
207
|
-
dest[(*dest_length)++] =
|
209
|
+
if (dest) {
|
210
|
+
dest[(*dest_length)++] = unescape_char(unescape_chars[backslash[1]], flags);
|
208
211
|
}
|
209
212
|
return backslash + 2;
|
210
213
|
// \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
|
211
214
|
case '0': case '1': case '2': case '3': case '4':
|
212
215
|
case '5': case '6': case '7': case '8': case '9': {
|
213
|
-
|
214
|
-
const
|
216
|
+
uint8_t value;
|
217
|
+
const uint8_t *cursor = backslash + unescape_octal(backslash, &value, end);
|
215
218
|
|
216
|
-
if (
|
217
|
-
dest[(*dest_length)++] =
|
219
|
+
if (dest) {
|
220
|
+
dest[(*dest_length)++] = unescape_char(value, flags);
|
218
221
|
}
|
219
222
|
return cursor;
|
220
223
|
}
|
221
224
|
// \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
|
222
225
|
case 'x': {
|
223
|
-
|
224
|
-
const
|
226
|
+
uint8_t value;
|
227
|
+
const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value, end, error_list);
|
225
228
|
|
226
|
-
if (
|
227
|
-
dest[(*dest_length)++] =
|
229
|
+
if (dest) {
|
230
|
+
dest[(*dest_length)++] = unescape_char(value, flags);
|
228
231
|
}
|
229
232
|
return cursor;
|
230
233
|
}
|
@@ -232,28 +235,28 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
|
232
235
|
// \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
|
233
236
|
case 'u': {
|
234
237
|
if ((flags & YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) {
|
235
|
-
yp_diagnostic_list_append(
|
238
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
|
236
239
|
return backslash + 2;
|
237
240
|
}
|
238
241
|
|
239
242
|
if ((backslash + 3) < end && backslash[2] == '{') {
|
240
|
-
const
|
241
|
-
const
|
243
|
+
const uint8_t *unicode_cursor = backslash + 3;
|
244
|
+
const uint8_t *extra_codepoints_start = NULL;
|
242
245
|
int codepoints_count = 0;
|
243
246
|
|
244
247
|
unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
|
245
248
|
|
246
|
-
while ((
|
247
|
-
const
|
249
|
+
while ((unicode_cursor < end) && (*unicode_cursor != '}')) {
|
250
|
+
const uint8_t *unicode_start = unicode_cursor;
|
248
251
|
size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
|
249
252
|
|
250
253
|
// \u{nnnn} character literal allows only 1-6 hexadecimal digits
|
251
|
-
if (hexadecimal_length > 6)
|
252
|
-
yp_diagnostic_list_append(
|
253
|
-
|
254
|
+
if (hexadecimal_length > 6) {
|
255
|
+
if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
|
256
|
+
}
|
254
257
|
// there are not hexadecimal characters
|
255
|
-
if (hexadecimal_length == 0) {
|
256
|
-
yp_diagnostic_list_append(
|
258
|
+
else if (hexadecimal_length == 0) {
|
259
|
+
if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
|
257
260
|
return unicode_cursor;
|
258
261
|
}
|
259
262
|
|
@@ -265,31 +268,37 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
|
265
268
|
|
266
269
|
uint32_t value;
|
267
270
|
unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
|
268
|
-
if (
|
269
|
-
*dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor,
|
271
|
+
if (dest) {
|
272
|
+
*dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
|
270
273
|
}
|
271
274
|
|
272
275
|
unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
|
273
276
|
}
|
274
277
|
|
275
278
|
// ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
|
276
|
-
if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
|
277
|
-
yp_diagnostic_list_append(
|
279
|
+
if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1) {
|
280
|
+
if (error_list) yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
|
281
|
+
}
|
278
282
|
|
279
|
-
|
280
|
-
|
283
|
+
if (unicode_cursor < end && *unicode_cursor == '}') {
|
284
|
+
unicode_cursor++;
|
285
|
+
} else {
|
286
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, unicode_cursor, "invalid Unicode escape.");
|
287
|
+
}
|
281
288
|
|
282
|
-
|
289
|
+
return unicode_cursor;
|
290
|
+
}
|
291
|
+
else if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
|
283
292
|
uint32_t value;
|
284
293
|
unescape_unicode(backslash + 2, 4, &value);
|
285
294
|
|
286
|
-
if (
|
287
|
-
*dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6,
|
295
|
+
if (dest) {
|
296
|
+
*dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
|
288
297
|
}
|
289
298
|
return backslash + 6;
|
290
299
|
}
|
291
300
|
|
292
|
-
yp_diagnostic_list_append(
|
301
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
|
293
302
|
return backslash + 2;
|
294
303
|
}
|
295
304
|
// \c\M-x meta control character, where x is an ASCII printable character
|
@@ -297,31 +306,31 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
|
297
306
|
// \cx control character, where x is an ASCII printable character
|
298
307
|
case 'c':
|
299
308
|
if (backslash + 2 >= end) {
|
300
|
-
yp_diagnostic_list_append(
|
309
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
301
310
|
return end;
|
302
311
|
}
|
303
312
|
|
304
313
|
if (flags & YP_UNESCAPE_FLAG_CONTROL) {
|
305
|
-
yp_diagnostic_list_append(
|
314
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
|
306
315
|
return backslash + 2;
|
307
316
|
}
|
308
317
|
|
309
318
|
switch (backslash[2]) {
|
310
319
|
case '\\':
|
311
|
-
return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL,
|
320
|
+
return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
|
312
321
|
case '?':
|
313
|
-
if (
|
314
|
-
dest[(*dest_length)++] =
|
322
|
+
if (dest) {
|
323
|
+
dest[(*dest_length)++] = unescape_char(0x7f, flags);
|
315
324
|
}
|
316
325
|
return backslash + 3;
|
317
326
|
default: {
|
318
327
|
if (!char_is_ascii_printable(backslash[2])) {
|
319
|
-
yp_diagnostic_list_append(
|
328
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
320
329
|
return backslash + 2;
|
321
330
|
}
|
322
331
|
|
323
|
-
if (
|
324
|
-
dest[(*dest_length)++] =
|
332
|
+
if (dest) {
|
333
|
+
dest[(*dest_length)++] = unescape_char(backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
|
325
334
|
}
|
326
335
|
return backslash + 3;
|
327
336
|
}
|
@@ -330,36 +339,36 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
|
330
339
|
// \C-? delete, ASCII 7Fh (DEL)
|
331
340
|
case 'C':
|
332
341
|
if (backslash + 3 >= end) {
|
333
|
-
yp_diagnostic_list_append(
|
342
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
334
343
|
return end;
|
335
344
|
}
|
336
345
|
|
337
346
|
if (flags & YP_UNESCAPE_FLAG_CONTROL) {
|
338
|
-
yp_diagnostic_list_append(
|
347
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
|
339
348
|
return backslash + 2;
|
340
349
|
}
|
341
350
|
|
342
351
|
if (backslash[2] != '-') {
|
343
|
-
yp_diagnostic_list_append(
|
352
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
344
353
|
return backslash + 2;
|
345
354
|
}
|
346
355
|
|
347
356
|
switch (backslash[3]) {
|
348
357
|
case '\\':
|
349
|
-
return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL,
|
358
|
+
return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
|
350
359
|
case '?':
|
351
|
-
if (
|
352
|
-
dest[(*dest_length)++] =
|
360
|
+
if (dest) {
|
361
|
+
dest[(*dest_length)++] = unescape_char(0x7f, flags);
|
353
362
|
}
|
354
363
|
return backslash + 4;
|
355
364
|
default:
|
356
365
|
if (!char_is_ascii_printable(backslash[3])) {
|
357
|
-
yp_diagnostic_list_append(
|
366
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
|
358
367
|
return backslash + 2;
|
359
368
|
}
|
360
369
|
|
361
|
-
if (
|
362
|
-
dest[(*dest_length)++] =
|
370
|
+
if (dest) {
|
371
|
+
dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
|
363
372
|
}
|
364
373
|
return backslash + 4;
|
365
374
|
}
|
@@ -368,32 +377,32 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
|
368
377
|
// \M-x meta character, where x is an ASCII printable character
|
369
378
|
case 'M': {
|
370
379
|
if (backslash + 3 >= end) {
|
371
|
-
yp_diagnostic_list_append(
|
380
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
|
372
381
|
return end;
|
373
382
|
}
|
374
383
|
|
375
384
|
if (flags & YP_UNESCAPE_FLAG_META) {
|
376
|
-
yp_diagnostic_list_append(
|
385
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
|
377
386
|
return backslash + 2;
|
378
387
|
}
|
379
388
|
|
380
389
|
if (backslash[2] != '-') {
|
381
|
-
yp_diagnostic_list_append(
|
390
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
|
382
391
|
return backslash + 2;
|
383
392
|
}
|
384
393
|
|
385
394
|
if (backslash[3] == '\\') {
|
386
|
-
return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META,
|
395
|
+
return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, error_list);
|
387
396
|
}
|
388
397
|
|
389
398
|
if (char_is_ascii_printable(backslash[3])) {
|
390
|
-
if (
|
391
|
-
dest[(*dest_length)++] =
|
399
|
+
if (dest) {
|
400
|
+
dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_META);
|
392
401
|
}
|
393
402
|
return backslash + 4;
|
394
403
|
}
|
395
404
|
|
396
|
-
yp_diagnostic_list_append(
|
405
|
+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
|
397
406
|
return backslash + 3;
|
398
407
|
}
|
399
408
|
// \n
|
@@ -409,7 +418,7 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
|
409
418
|
default: {
|
410
419
|
size_t width = yp_char_width(parser, backslash + 1, end);
|
411
420
|
|
412
|
-
if (
|
421
|
+
if (dest) {
|
413
422
|
memcpy(dest + *dest_length, backslash + 1, width);
|
414
423
|
*dest_length += width;
|
415
424
|
}
|
@@ -447,14 +456,14 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
|
|
447
456
|
// \c\M-x same as above
|
448
457
|
// \c? or \C-? delete, ASCII 7Fh (DEL)
|
449
458
|
//
|
450
|
-
|
451
|
-
|
459
|
+
static void
|
460
|
+
yp_unescape_manipulate_string_or_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
|
452
461
|
if (unescape_type == YP_UNESCAPE_NONE) {
|
453
462
|
// If we're not unescaping then we can reference the source directly.
|
454
463
|
return;
|
455
464
|
}
|
456
465
|
|
457
|
-
const
|
466
|
+
const uint8_t *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);
|
458
467
|
|
459
468
|
if (backslash == NULL) {
|
460
469
|
// Here there are no escapes, so we can reference the source directly.
|
@@ -463,21 +472,21 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
|
|
463
472
|
|
464
473
|
// Here we have found an escape character, so we need to handle all escapes
|
465
474
|
// within the string.
|
466
|
-
|
475
|
+
uint8_t *allocated = malloc(string->length);
|
467
476
|
if (allocated == NULL) {
|
468
477
|
yp_diagnostic_list_append(&parser->error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
|
469
478
|
return;
|
470
479
|
}
|
471
480
|
|
472
481
|
// This is the memory address where we're putting the unescaped string.
|
473
|
-
|
482
|
+
uint8_t *dest = allocated;
|
474
483
|
size_t dest_length = 0;
|
475
484
|
|
476
485
|
// This is the current position in the source string that we're looking at.
|
477
486
|
// It's going to move along behind the backslash so that we can copy each
|
478
487
|
// segment of the string that doesn't contain an escape.
|
479
|
-
const
|
480
|
-
const
|
488
|
+
const uint8_t *cursor = string->source;
|
489
|
+
const uint8_t *end = string->source + string->length;
|
481
490
|
|
482
491
|
// For each escape found in the source string, we will handle it and update
|
483
492
|
// the moving cursor->backslash window.
|
@@ -496,7 +505,7 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
|
|
496
505
|
switch (backslash[1]) {
|
497
506
|
case '\\':
|
498
507
|
case '\'':
|
499
|
-
dest[dest_length++] =
|
508
|
+
dest[dest_length++] = unescape_chars[backslash[1]];
|
500
509
|
cursor = backslash + 2;
|
501
510
|
break;
|
502
511
|
default:
|
@@ -510,7 +519,13 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
|
|
510
519
|
// This is the only type of unescaping left. In this case we need to
|
511
520
|
// handle all of the different unescapes.
|
512
521
|
assert(unescape_type == YP_UNESCAPE_ALL);
|
513
|
-
|
522
|
+
|
523
|
+
uint8_t flags = YP_UNESCAPE_FLAG_NONE;
|
524
|
+
if (expect_single_codepoint) {
|
525
|
+
flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
|
526
|
+
}
|
527
|
+
|
528
|
+
cursor = unescape(parser, dest, &dest_length, backslash, end, flags, &parser->error_list);
|
514
529
|
break;
|
515
530
|
}
|
516
531
|
|
@@ -538,13 +553,27 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
|
|
538
553
|
yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor)));
|
539
554
|
}
|
540
555
|
|
556
|
+
YP_EXPORTED_FUNCTION void
|
557
|
+
yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
|
558
|
+
yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, false);
|
559
|
+
}
|
560
|
+
|
561
|
+
void
|
562
|
+
yp_unescape_manipulate_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
|
563
|
+
yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, true);
|
564
|
+
}
|
565
|
+
|
541
566
|
// This function is similar to yp_unescape_manipulate_string, except it doesn't
|
542
567
|
// actually perform any string manipulations. Instead, it calculates how long
|
543
568
|
// the unescaped character is, and returns that value
|
544
569
|
size_t
|
545
|
-
yp_unescape_calculate_difference(yp_parser_t *parser, const
|
570
|
+
yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
|
546
571
|
assert(unescape_type != YP_UNESCAPE_NONE);
|
547
572
|
|
573
|
+
if (backslash + 1 >= parser->end) {
|
574
|
+
return 0;
|
575
|
+
}
|
576
|
+
|
548
577
|
switch (backslash[1]) {
|
549
578
|
case '\\':
|
550
579
|
case '\'':
|
@@ -558,11 +587,12 @@ yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_
|
|
558
587
|
// handle all of the different unescapes.
|
559
588
|
assert(unescape_type == YP_UNESCAPE_ALL);
|
560
589
|
|
561
|
-
|
562
|
-
if (expect_single_codepoint)
|
590
|
+
uint8_t flags = YP_UNESCAPE_FLAG_NONE;
|
591
|
+
if (expect_single_codepoint) {
|
563
592
|
flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
|
593
|
+
}
|
564
594
|
|
565
|
-
const
|
595
|
+
const uint8_t *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, NULL);
|
566
596
|
assert(cursor > backslash);
|
567
597
|
|
568
598
|
return (size_t) (cursor - backslash);
|
@@ -574,7 +604,7 @@ yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_
|
|
574
604
|
// string, a type of unescaping, and a pointer to a result string. It returns a
|
575
605
|
// boolean indicating whether or not the unescaping was successful.
|
576
606
|
YP_EXPORTED_FUNCTION bool
|
577
|
-
yp_unescape_string(const
|
607
|
+
yp_unescape_string(const uint8_t *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
|
578
608
|
yp_parser_t parser;
|
579
609
|
yp_parser_init(&parser, start, length, NULL);
|
580
610
|
|
data/src/util/yp_buffer.c
CHANGED
@@ -63,8 +63,13 @@ yp_buffer_append_zeroes(yp_buffer_t *buffer, size_t length) {
|
|
63
63
|
// Append a string to the buffer.
|
64
64
|
void
|
65
65
|
yp_buffer_append_str(yp_buffer_t *buffer, const char *value, size_t length) {
|
66
|
-
|
67
|
-
|
66
|
+
yp_buffer_append(buffer, value, length);
|
67
|
+
}
|
68
|
+
|
69
|
+
// Append a list of bytes to the buffer.
|
70
|
+
void
|
71
|
+
yp_buffer_append_bytes(yp_buffer_t *buffer, const uint8_t *value, size_t length) {
|
72
|
+
yp_buffer_append(buffer, (const char *) value, length);
|
68
73
|
}
|
69
74
|
|
70
75
|
// Append a single byte to the buffer.
|