yarp 0.9.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +15 -1
  3. data/Makefile +5 -1
  4. data/config.yml +156 -125
  5. data/docs/encoding.md +5 -5
  6. data/docs/serialization.md +2 -2
  7. data/ext/yarp/api_node.c +142 -98
  8. data/ext/yarp/extension.c +21 -7
  9. data/ext/yarp/extension.h +1 -1
  10. data/include/yarp/ast.h +327 -18
  11. data/include/yarp/defines.h +2 -1
  12. data/include/yarp/diagnostic.h +3 -3
  13. data/include/yarp/enc/yp_encoding.h +10 -10
  14. data/include/yarp/parser.h +19 -19
  15. data/include/yarp/regexp.h +1 -1
  16. data/include/yarp/unescape.h +4 -4
  17. data/include/yarp/util/yp_buffer.h +3 -0
  18. data/include/yarp/util/yp_char.h +16 -16
  19. data/include/yarp/util/yp_constant_pool.h +2 -2
  20. data/include/yarp/util/yp_newline_list.h +5 -5
  21. data/include/yarp/util/yp_string.h +4 -4
  22. data/include/yarp/util/yp_string_list.h +0 -3
  23. data/include/yarp/util/yp_strpbrk.h +1 -1
  24. data/include/yarp/version.h +2 -2
  25. data/include/yarp.h +5 -4
  26. data/lib/yarp/desugar_visitor.rb +59 -122
  27. data/lib/yarp/node.rb +230 -240
  28. data/lib/yarp/serialize.rb +16 -16
  29. data/lib/yarp.rb +5 -5
  30. data/src/diagnostic.c +1 -1
  31. data/src/enc/yp_big5.c +15 -42
  32. data/src/enc/yp_euc_jp.c +16 -43
  33. data/src/enc/yp_gbk.c +19 -46
  34. data/src/enc/yp_shift_jis.c +16 -43
  35. data/src/enc/yp_tables.c +36 -38
  36. data/src/enc/yp_unicode.c +20 -25
  37. data/src/enc/yp_windows_31j.c +16 -43
  38. data/src/node.c +1271 -899
  39. data/src/prettyprint.c +87 -48
  40. data/src/regexp.c +21 -21
  41. data/src/serialize.c +28 -15
  42. data/src/unescape.c +151 -121
  43. data/src/util/yp_buffer.c +7 -2
  44. data/src/util/yp_char.c +34 -34
  45. data/src/util/yp_constant_pool.c +4 -4
  46. data/src/util/yp_memchr.c +1 -1
  47. data/src/util/yp_newline_list.c +5 -4
  48. data/src/util/yp_string.c +22 -20
  49. data/src/util/yp_string_list.c +0 -6
  50. data/src/util/yp_strncasecmp.c +3 -6
  51. data/src/util/yp_strpbrk.c +8 -8
  52. data/src/yarp.c +355 -216
  53. data/yarp.gemspec +1 -1
  54. metadata +2 -2
data/src/unescape.c CHANGED
@@ -5,9 +5,9 @@
5
5
  /******************************************************************************/
6
6
 
7
7
  static inline bool
8
- yp_char_is_hexadecimal_digits(const char *c, size_t length) {
8
+ yp_char_is_hexadecimal_digits(const uint8_t *string, size_t length) {
9
9
  for (size_t index = 0; index < length; index++) {
10
- if (!yp_char_is_hexadecimal_digit(c[index])) {
10
+ if (!yp_char_is_hexadecimal_digit(string[index])) {
11
11
  return false;
12
12
  }
13
13
  }
@@ -18,10 +18,8 @@ yp_char_is_hexadecimal_digits(const char *c, size_t length) {
18
18
  // expensive to go through the indirection of the function pointer. Instead we
19
19
  // provide a fast path that will check if we can just return 1.
20
20
  static inline size_t
21
- yp_char_width(yp_parser_t *parser, const char *start, const char *end) {
22
- const unsigned char *uc = (const unsigned char *) start;
23
-
24
- if (parser->encoding_changed || (*uc >= 0x80)) {
21
+ yp_char_width(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
22
+ if (parser->encoding_changed || (*start >= 0x80)) {
25
23
  return parser->encoding.char_width(start, end - start);
26
24
  } else {
27
25
  return 1;
@@ -33,7 +31,7 @@ yp_char_width(yp_parser_t *parser, const char *start, const char *end) {
33
31
  /******************************************************************************/
34
32
 
35
33
  // This is a lookup table for unescapes that only take up a single character.
36
- static const unsigned char unescape_chars[] = {
34
+ static const uint8_t unescape_chars[] = {
37
35
  ['\''] = '\'',
38
36
  ['\\'] = '\\',
39
37
  ['a'] = '\a',
@@ -60,9 +58,8 @@ static const bool ascii_printable_chars[] = {
60
58
  };
61
59
 
62
60
  static inline bool
63
- char_is_ascii_printable(const char c) {
64
- unsigned char v = (unsigned char) c;
65
- return (v < 0x80) && ascii_printable_chars[v];
61
+ char_is_ascii_printable(const uint8_t b) {
62
+ return (b < 0x80) && ascii_printable_chars[b];
66
63
  }
67
64
 
68
65
  /******************************************************************************/
@@ -72,37 +69,39 @@ char_is_ascii_printable(const char c) {
72
69
  // Scan the 1-3 digits of octal into the value. Returns the number of bytes
73
70
  // consumed, counting the leading backslash (so 2, 3, or 4).
74
71
  static inline size_t
75
- unescape_octal(const char *backslash, unsigned char *value) {
76
- *value = (unsigned char) (backslash[1] - '0');
77
- if (!yp_char_is_octal_digit(backslash[2])) {
72
+ unescape_octal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
73
+ *value = (uint8_t) (backslash[1] - '0');
74
+ if (backslash + 2 >= end || !yp_char_is_octal_digit(backslash[2])) {
78
75
  return 2;
79
76
  }
80
-
81
- *value = (unsigned char) ((*value << 3) | (backslash[2] - '0'));
82
- if (!yp_char_is_octal_digit(backslash[3])) {
77
+ *value = (uint8_t) ((*value << 3) | (backslash[2] - '0'));
78
+ if (backslash + 3 >= end || !yp_char_is_octal_digit(backslash[3])) {
83
79
  return 3;
84
80
  }
85
-
86
- *value = (unsigned char) ((*value << 3) | (backslash[3] - '0'));
81
+ *value = (uint8_t) ((*value << 3) | (backslash[3] - '0'));
87
82
  return 4;
88
83
  }
89
84
 
90
85
  // Convert a hexadecimal digit into its equivalent value.
91
- static inline unsigned char
92
- unescape_hexadecimal_digit(const char value) {
93
- return (unsigned char) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
86
+ static inline uint8_t
87
+ unescape_hexadecimal_digit(const uint8_t value) {
88
+ return (uint8_t) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
94
89
  }
95
90
 
96
91
  // Scan the 1-2 digits of hexadecimal into the value. Returns the number of
97
92
  // bytes consumed, counting the leading "\x" prefix.
98
93
  static inline size_t
99
- unescape_hexadecimal(const char *backslash, unsigned char *value) {
94
+ unescape_hexadecimal(const uint8_t *backslash, uint8_t *value, const uint8_t *end, yp_list_t *error_list) {
95
+ *value = 0;
96
+ if (backslash + 2 >= end || !yp_char_is_hexadecimal_digit(backslash[2])) {
97
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid hex escape.");
98
+ return 2;
99
+ }
100
100
  *value = unescape_hexadecimal_digit(backslash[2]);
101
- if (!yp_char_is_hexadecimal_digit(backslash[3])) {
101
+ if (backslash + 3 >= end || !yp_char_is_hexadecimal_digit(backslash[3])) {
102
102
  return 3;
103
103
  }
104
-
105
- *value = (unsigned char) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
104
+ *value = (uint8_t) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
106
105
  return 4;
107
106
  }
108
107
 
@@ -110,7 +109,7 @@ unescape_hexadecimal(const char *backslash, unsigned char *value) {
110
109
  // digits scanned. This function assumes that the characters have already been
111
110
  // validated.
112
111
  static inline void
113
- unescape_unicode(const char *string, size_t length, uint32_t *value) {
112
+ unescape_unicode(const uint8_t *string, size_t length, uint32_t *value) {
114
113
  *value = 0;
115
114
  for (size_t index = 0; index < length; index++) {
116
115
  if (index != 0) *value <<= 4;
@@ -122,27 +121,25 @@ unescape_unicode(const char *string, size_t length, uint32_t *value) {
122
121
  // 32-bit value to write. Writes the UTF-8 representation of the value to the
123
122
  // string and returns the number of bytes written.
124
123
  static inline size_t
125
- unescape_unicode_write(char *dest, uint32_t value, const char *start, const char *end, yp_list_t *error_list) {
126
- unsigned char *bytes = (unsigned char *) dest;
127
-
124
+ unescape_unicode_write(uint8_t *dest, uint32_t value, const uint8_t *start, const uint8_t *end, yp_list_t *error_list) {
128
125
  if (value <= 0x7F) {
129
126
  // 0xxxxxxx
130
- bytes[0] = (unsigned char) value;
127
+ dest[0] = (uint8_t) value;
131
128
  return 1;
132
129
  }
133
130
 
134
131
  if (value <= 0x7FF) {
135
132
  // 110xxxxx 10xxxxxx
136
- bytes[0] = (unsigned char) (0xC0 | (value >> 6));
137
- bytes[1] = (unsigned char) (0x80 | (value & 0x3F));
133
+ dest[0] = (uint8_t) (0xC0 | (value >> 6));
134
+ dest[1] = (uint8_t) (0x80 | (value & 0x3F));
138
135
  return 2;
139
136
  }
140
137
 
141
138
  if (value <= 0xFFFF) {
142
139
  // 1110xxxx 10xxxxxx 10xxxxxx
143
- bytes[0] = (unsigned char) (0xE0 | (value >> 12));
144
- bytes[1] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
145
- bytes[2] = (unsigned char) (0x80 | (value & 0x3F));
140
+ dest[0] = (uint8_t) (0xE0 | (value >> 12));
141
+ dest[1] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
142
+ dest[2] = (uint8_t) (0x80 | (value & 0x3F));
146
143
  return 3;
147
144
  }
148
145
 
@@ -150,20 +147,20 @@ unescape_unicode_write(char *dest, uint32_t value, const char *start, const char
150
147
  // the input is invalid.
151
148
  if (value <= 0x10FFFF) {
152
149
  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
153
- bytes[0] = (unsigned char) (0xF0 | (value >> 18));
154
- bytes[1] = (unsigned char) (0x80 | ((value >> 12) & 0x3F));
155
- bytes[2] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
156
- bytes[3] = (unsigned char) (0x80 | (value & 0x3F));
150
+ dest[0] = (uint8_t) (0xF0 | (value >> 18));
151
+ dest[1] = (uint8_t) (0x80 | ((value >> 12) & 0x3F));
152
+ dest[2] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
153
+ dest[3] = (uint8_t) (0x80 | (value & 0x3F));
157
154
  return 4;
158
155
  }
159
156
 
160
157
  // If we get here, then the value is too big. This is an error, but we don't
161
158
  // want to just crash, so instead we'll add an error to the error list and put
162
159
  // in a replacement character instead.
163
- yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
164
- bytes[0] = 0xEF;
165
- bytes[1] = 0xBF;
166
- bytes[2] = 0xBD;
160
+ if (error_list) yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
161
+ dest[0] = 0xEF;
162
+ dest[1] = 0xBF;
163
+ dest[2] = 0xBD;
167
164
  return 3;
168
165
  }
169
166
 
@@ -175,24 +172,30 @@ typedef enum {
175
172
  } yp_unescape_flag_t;
176
173
 
177
174
  // Unescape a single character value based on the given flags.
178
- static inline unsigned char
179
- unescape_char(const unsigned char value, const unsigned char flags) {
180
- unsigned char unescaped = value;
181
-
175
+ static inline uint8_t
176
+ unescape_char(uint8_t value, const uint8_t flags) {
182
177
  if (flags & YP_UNESCAPE_FLAG_CONTROL) {
183
- unescaped &= 0x1f;
178
+ value &= 0x1f;
184
179
  }
185
180
 
186
181
  if (flags & YP_UNESCAPE_FLAG_META) {
187
- unescaped |= 0x80;
182
+ value |= 0x80;
188
183
  }
189
184
 
190
- return unescaped;
185
+ return value;
191
186
  }
192
187
 
193
188
  // Read a specific escape sequence into the given destination.
194
- static const char *
195
- unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backslash, const char *end, const unsigned char flags, bool write_to_str) {
189
+ static const uint8_t *
190
+ unescape(
191
+ yp_parser_t *parser,
192
+ uint8_t *dest,
193
+ size_t *dest_length,
194
+ const uint8_t *backslash,
195
+ const uint8_t *end,
196
+ const uint8_t flags,
197
+ yp_list_t *error_list
198
+ ) {
196
199
  switch (backslash[1]) {
197
200
  case 'a':
198
201
  case 'b':
@@ -203,28 +206,28 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
203
206
  case 's':
204
207
  case 't':
205
208
  case 'v':
206
- if (write_to_str) {
207
- dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[1]], flags);
209
+ if (dest) {
210
+ dest[(*dest_length)++] = unescape_char(unescape_chars[backslash[1]], flags);
208
211
  }
209
212
  return backslash + 2;
210
213
  // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
211
214
  case '0': case '1': case '2': case '3': case '4':
212
215
  case '5': case '6': case '7': case '8': case '9': {
213
- unsigned char value;
214
- const char *cursor = backslash + unescape_octal(backslash, &value);
216
+ uint8_t value;
217
+ const uint8_t *cursor = backslash + unescape_octal(backslash, &value, end);
215
218
 
216
- if (write_to_str) {
217
- dest[(*dest_length)++] = (char) unescape_char(value, flags);
219
+ if (dest) {
220
+ dest[(*dest_length)++] = unescape_char(value, flags);
218
221
  }
219
222
  return cursor;
220
223
  }
221
224
  // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
222
225
  case 'x': {
223
- unsigned char value;
224
- const char *cursor = backslash + unescape_hexadecimal(backslash, &value);
226
+ uint8_t value;
227
+ const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value, end, error_list);
225
228
 
226
- if (write_to_str) {
227
- dest[(*dest_length)++] = (char) unescape_char(value, flags);
229
+ if (dest) {
230
+ dest[(*dest_length)++] = unescape_char(value, flags);
228
231
  }
229
232
  return cursor;
230
233
  }
@@ -232,28 +235,28 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
232
235
  // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
233
236
  case 'u': {
234
237
  if ((flags & YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) {
235
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
238
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
236
239
  return backslash + 2;
237
240
  }
238
241
 
239
242
  if ((backslash + 3) < end && backslash[2] == '{') {
240
- const char *unicode_cursor = backslash + 3;
241
- const char *extra_codepoints_start = NULL;
243
+ const uint8_t *unicode_cursor = backslash + 3;
244
+ const uint8_t *extra_codepoints_start = NULL;
242
245
  int codepoints_count = 0;
243
246
 
244
247
  unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
245
248
 
246
- while ((*unicode_cursor != '}') && (unicode_cursor < end)) {
247
- const char *unicode_start = unicode_cursor;
249
+ while ((unicode_cursor < end) && (*unicode_cursor != '}')) {
250
+ const uint8_t *unicode_start = unicode_cursor;
248
251
  size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
249
252
 
250
253
  // \u{nnnn} character literal allows only 1-6 hexadecimal digits
251
- if (hexadecimal_length > 6)
252
- yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
253
-
254
+ if (hexadecimal_length > 6) {
255
+ if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
256
+ }
254
257
  // there are no hexadecimal characters
255
- if (hexadecimal_length == 0) {
256
- yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
258
+ else if (hexadecimal_length == 0) {
259
+ if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
257
260
  return unicode_cursor;
258
261
  }
259
262
 
@@ -265,31 +268,37 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
265
268
 
266
269
  uint32_t value;
267
270
  unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
268
- if (write_to_str) {
269
- *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, &parser->error_list);
271
+ if (dest) {
272
+ *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
270
273
  }
271
274
 
272
275
  unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
273
276
  }
274
277
 
275
278
  // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
276
- if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
277
- yp_diagnostic_list_append(&parser->error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
279
+ if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1) {
280
+ if (error_list) yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
281
+ }
278
282
 
279
- return unicode_cursor + 1;
280
- }
283
+ if (unicode_cursor < end && *unicode_cursor == '}') {
284
+ unicode_cursor++;
285
+ } else {
286
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, unicode_cursor, "invalid Unicode escape.");
287
+ }
281
288
 
282
- if ((backslash + 2) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
289
+ return unicode_cursor;
290
+ }
291
+ else if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
283
292
  uint32_t value;
284
293
  unescape_unicode(backslash + 2, 4, &value);
285
294
 
286
- if (write_to_str) {
287
- *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, &parser->error_list);
295
+ if (dest) {
296
+ *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
288
297
  }
289
298
  return backslash + 6;
290
299
  }
291
300
 
292
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
301
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
293
302
  return backslash + 2;
294
303
  }
295
304
  // \c\M-x meta control character, where x is an ASCII printable character
@@ -297,31 +306,31 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
297
306
  // \cx control character, where x is an ASCII printable character
298
307
  case 'c':
299
308
  if (backslash + 2 >= end) {
300
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
309
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
301
310
  return end;
302
311
  }
303
312
 
304
313
  if (flags & YP_UNESCAPE_FLAG_CONTROL) {
305
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
314
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
306
315
  return backslash + 2;
307
316
  }
308
317
 
309
318
  switch (backslash[2]) {
310
319
  case '\\':
311
- return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
320
+ return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
312
321
  case '?':
313
- if (write_to_str) {
314
- dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
322
+ if (dest) {
323
+ dest[(*dest_length)++] = unescape_char(0x7f, flags);
315
324
  }
316
325
  return backslash + 3;
317
326
  default: {
318
327
  if (!char_is_ascii_printable(backslash[2])) {
319
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
328
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
320
329
  return backslash + 2;
321
330
  }
322
331
 
323
- if (write_to_str) {
324
- dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
332
+ if (dest) {
333
+ dest[(*dest_length)++] = unescape_char(backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
325
334
  }
326
335
  return backslash + 3;
327
336
  }
@@ -330,36 +339,36 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
330
339
  // \C-? delete, ASCII 7Fh (DEL)
331
340
  case 'C':
332
341
  if (backslash + 3 >= end) {
333
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
342
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
334
343
  return end;
335
344
  }
336
345
 
337
346
  if (flags & YP_UNESCAPE_FLAG_CONTROL) {
338
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
347
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
339
348
  return backslash + 2;
340
349
  }
341
350
 
342
351
  if (backslash[2] != '-') {
343
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
352
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
344
353
  return backslash + 2;
345
354
  }
346
355
 
347
356
  switch (backslash[3]) {
348
357
  case '\\':
349
- return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
358
+ return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
350
359
  case '?':
351
- if (write_to_str) {
352
- dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
360
+ if (dest) {
361
+ dest[(*dest_length)++] = unescape_char(0x7f, flags);
353
362
  }
354
363
  return backslash + 4;
355
364
  default:
356
365
  if (!char_is_ascii_printable(backslash[3])) {
357
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid control escape sequence");
366
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
358
367
  return backslash + 2;
359
368
  }
360
369
 
361
- if (write_to_str) {
362
- dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
370
+ if (dest) {
371
+ dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
363
372
  }
364
373
  return backslash + 4;
365
374
  }
@@ -368,32 +377,32 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
368
377
  // \M-x meta character, where x is an ASCII printable character
369
378
  case 'M': {
370
379
  if (backslash + 3 >= end) {
371
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
380
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
372
381
  return end;
373
382
  }
374
383
 
375
384
  if (flags & YP_UNESCAPE_FLAG_META) {
376
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
385
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
377
386
  return backslash + 2;
378
387
  }
379
388
 
380
389
  if (backslash[2] != '-') {
381
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence");
390
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
382
391
  return backslash + 2;
383
392
  }
384
393
 
385
394
  if (backslash[3] == '\\') {
386
- return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, write_to_str);
395
+ return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, error_list);
387
396
  }
388
397
 
389
398
  if (char_is_ascii_printable(backslash[3])) {
390
- if (write_to_str) {
391
- dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_META);
399
+ if (dest) {
400
+ dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_META);
392
401
  }
393
402
  return backslash + 4;
394
403
  }
395
404
 
396
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence");
405
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
397
406
  return backslash + 3;
398
407
  }
399
408
  // \n
@@ -409,7 +418,7 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
409
418
  default: {
410
419
  size_t width = yp_char_width(parser, backslash + 1, end);
411
420
 
412
- if (write_to_str) {
421
+ if (dest) {
413
422
  memcpy(dest + *dest_length, backslash + 1, width);
414
423
  *dest_length += width;
415
424
  }
@@ -447,14 +456,14 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
447
456
  // \c\M-x same as above
448
457
  // \c? or \C-? delete, ASCII 7Fh (DEL)
449
458
  //
450
- YP_EXPORTED_FUNCTION void
451
- yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
459
+ static void
460
+ yp_unescape_manipulate_string_or_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
452
461
  if (unescape_type == YP_UNESCAPE_NONE) {
453
462
  // If we're not unescaping then we can reference the source directly.
454
463
  return;
455
464
  }
456
465
 
457
- const char *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);
466
+ const uint8_t *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);
458
467
 
459
468
  if (backslash == NULL) {
460
469
  // Here there are no escapes, so we can reference the source directly.
@@ -463,21 +472,21 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
463
472
 
464
473
  // Here we have found an escape character, so we need to handle all escapes
465
474
  // within the string.
466
- char *allocated = malloc(string->length);
475
+ uint8_t *allocated = malloc(string->length);
467
476
  if (allocated == NULL) {
468
477
  yp_diagnostic_list_append(&parser->error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
469
478
  return;
470
479
  }
471
480
 
472
481
  // This is the memory address where we're putting the unescaped string.
473
- char *dest = allocated;
482
+ uint8_t *dest = allocated;
474
483
  size_t dest_length = 0;
475
484
 
476
485
  // This is the current position in the source string that we're looking at.
477
486
  // It's going to move along behind the backslash so that we can copy each
478
487
  // segment of the string that doesn't contain an escape.
479
- const char *cursor = string->source;
480
- const char *end = string->source + string->length;
488
+ const uint8_t *cursor = string->source;
489
+ const uint8_t *end = string->source + string->length;
481
490
 
482
491
  // For each escape found in the source string, we will handle it and update
483
492
  // the moving cursor->backslash window.
@@ -496,7 +505,7 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
496
505
  switch (backslash[1]) {
497
506
  case '\\':
498
507
  case '\'':
499
- dest[dest_length++] = (char) unescape_chars[(unsigned char) backslash[1]];
508
+ dest[dest_length++] = unescape_chars[backslash[1]];
500
509
  cursor = backslash + 2;
501
510
  break;
502
511
  default:
@@ -510,7 +519,13 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
510
519
  // This is the only type of unescaping left. In this case we need to
511
520
  // handle all of the different unescapes.
512
521
  assert(unescape_type == YP_UNESCAPE_ALL);
513
- cursor = unescape(parser, dest, &dest_length, backslash, end, YP_UNESCAPE_FLAG_NONE, true);
522
+
523
+ uint8_t flags = YP_UNESCAPE_FLAG_NONE;
524
+ if (expect_single_codepoint) {
525
+ flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
526
+ }
527
+
528
+ cursor = unescape(parser, dest, &dest_length, backslash, end, flags, &parser->error_list);
514
529
  break;
515
530
  }
516
531
 
@@ -538,13 +553,27 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
538
553
  yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor)));
539
554
  }
540
555
 
556
+ YP_EXPORTED_FUNCTION void
557
+ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
558
+ yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, false);
559
+ }
560
+
561
+ void
562
+ yp_unescape_manipulate_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
563
+ yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, true);
564
+ }
565
+
541
566
  // This function is similar to yp_unescape_manipulate_string, except it doesn't
542
567
  // actually perform any string manipulations. Instead, it calculates how many
543
568
  // source bytes the escape sequence at the backslash occupies, and returns that value.
544
569
  size_t
545
- yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
570
+ yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
546
571
  assert(unescape_type != YP_UNESCAPE_NONE);
547
572
 
573
+ if (backslash + 1 >= parser->end) {
574
+ return 0;
575
+ }
576
+
548
577
  switch (backslash[1]) {
549
578
  case '\\':
550
579
  case '\'':
@@ -558,11 +587,12 @@ yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_
558
587
  // handle all of the different unescapes.
559
588
  assert(unescape_type == YP_UNESCAPE_ALL);
560
589
 
561
- unsigned char flags = YP_UNESCAPE_FLAG_NONE;
562
- if (expect_single_codepoint)
590
+ uint8_t flags = YP_UNESCAPE_FLAG_NONE;
591
+ if (expect_single_codepoint) {
563
592
  flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
593
+ }
564
594
 
565
- const char *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, false);
595
+ const uint8_t *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, NULL);
566
596
  assert(cursor > backslash);
567
597
 
568
598
  return (size_t) (cursor - backslash);
@@ -574,7 +604,7 @@ yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_
574
604
  // string, a type of unescaping, and a pointer to a result string. It returns a
575
605
  // boolean indicating whether or not the unescaping was successful.
576
606
  YP_EXPORTED_FUNCTION bool
577
- yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
607
+ yp_unescape_string(const uint8_t *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
578
608
  yp_parser_t parser;
579
609
  yp_parser_init(&parser, start, length, NULL);
580
610
 
data/src/util/yp_buffer.c CHANGED
@@ -63,8 +63,13 @@ yp_buffer_append_zeroes(yp_buffer_t *buffer, size_t length) {
63
63
  // Append a string to the buffer.
64
64
  void
65
65
  yp_buffer_append_str(yp_buffer_t *buffer, const char *value, size_t length) {
66
- const void *source = value;
67
- yp_buffer_append(buffer, source, length);
66
+ yp_buffer_append(buffer, value, length);
67
+ }
68
+
69
+ // Append a list of bytes to the buffer.
70
+ void
71
+ yp_buffer_append_bytes(yp_buffer_t *buffer, const uint8_t *value, size_t length) {
72
+ yp_buffer_append(buffer, (const char *) value, length);
68
73
  }
69
74
 
70
75
  // Append a single byte to the buffer.