yarp 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +15 -1
  3. data/Makefile +5 -1
  4. data/config.yml +156 -125
  5. data/docs/encoding.md +5 -5
  6. data/docs/serialization.md +2 -2
  7. data/ext/yarp/api_node.c +142 -98
  8. data/ext/yarp/extension.c +21 -7
  9. data/ext/yarp/extension.h +1 -1
  10. data/include/yarp/ast.h +327 -18
  11. data/include/yarp/defines.h +2 -1
  12. data/include/yarp/diagnostic.h +3 -3
  13. data/include/yarp/enc/yp_encoding.h +10 -10
  14. data/include/yarp/parser.h +19 -19
  15. data/include/yarp/regexp.h +1 -1
  16. data/include/yarp/unescape.h +4 -4
  17. data/include/yarp/util/yp_buffer.h +3 -0
  18. data/include/yarp/util/yp_char.h +16 -16
  19. data/include/yarp/util/yp_constant_pool.h +2 -2
  20. data/include/yarp/util/yp_newline_list.h +5 -5
  21. data/include/yarp/util/yp_string.h +4 -4
  22. data/include/yarp/util/yp_string_list.h +0 -3
  23. data/include/yarp/util/yp_strpbrk.h +1 -1
  24. data/include/yarp/version.h +2 -2
  25. data/include/yarp.h +5 -4
  26. data/lib/yarp/desugar_visitor.rb +59 -122
  27. data/lib/yarp/node.rb +230 -240
  28. data/lib/yarp/serialize.rb +16 -16
  29. data/lib/yarp.rb +5 -5
  30. data/src/diagnostic.c +1 -1
  31. data/src/enc/yp_big5.c +15 -42
  32. data/src/enc/yp_euc_jp.c +16 -43
  33. data/src/enc/yp_gbk.c +19 -46
  34. data/src/enc/yp_shift_jis.c +16 -43
  35. data/src/enc/yp_tables.c +36 -38
  36. data/src/enc/yp_unicode.c +20 -25
  37. data/src/enc/yp_windows_31j.c +16 -43
  38. data/src/node.c +1271 -899
  39. data/src/prettyprint.c +87 -48
  40. data/src/regexp.c +21 -21
  41. data/src/serialize.c +28 -15
  42. data/src/unescape.c +151 -121
  43. data/src/util/yp_buffer.c +7 -2
  44. data/src/util/yp_char.c +34 -34
  45. data/src/util/yp_constant_pool.c +4 -4
  46. data/src/util/yp_memchr.c +1 -1
  47. data/src/util/yp_newline_list.c +5 -4
  48. data/src/util/yp_string.c +22 -20
  49. data/src/util/yp_string_list.c +0 -6
  50. data/src/util/yp_strncasecmp.c +3 -6
  51. data/src/util/yp_strpbrk.c +8 -8
  52. data/src/yarp.c +355 -216
  53. data/yarp.gemspec +1 -1
  54. metadata +2 -2
data/src/unescape.c CHANGED
@@ -5,9 +5,9 @@
5
5
  /******************************************************************************/
6
6
 
7
7
  static inline bool
8
- yp_char_is_hexadecimal_digits(const char *c, size_t length) {
8
+ yp_char_is_hexadecimal_digits(const uint8_t *string, size_t length) {
9
9
  for (size_t index = 0; index < length; index++) {
10
- if (!yp_char_is_hexadecimal_digit(c[index])) {
10
+ if (!yp_char_is_hexadecimal_digit(string[index])) {
11
11
  return false;
12
12
  }
13
13
  }
@@ -18,10 +18,8 @@ yp_char_is_hexadecimal_digits(const char *c, size_t length) {
18
18
  // expensive to go through the indirection of the function pointer. Instead we
19
19
  // provide a fast path that will check if we can just return 1.
20
20
  static inline size_t
21
- yp_char_width(yp_parser_t *parser, const char *start, const char *end) {
22
- const unsigned char *uc = (const unsigned char *) start;
23
-
24
- if (parser->encoding_changed || (*uc >= 0x80)) {
21
+ yp_char_width(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
22
+ if (parser->encoding_changed || (*start >= 0x80)) {
25
23
  return parser->encoding.char_width(start, end - start);
26
24
  } else {
27
25
  return 1;
@@ -33,7 +31,7 @@ yp_char_width(yp_parser_t *parser, const char *start, const char *end) {
33
31
  /******************************************************************************/
34
32
 
35
33
  // This is a lookup table for unescapes that only take up a single character.
36
- static const unsigned char unescape_chars[] = {
34
+ static const uint8_t unescape_chars[] = {
37
35
  ['\''] = '\'',
38
36
  ['\\'] = '\\',
39
37
  ['a'] = '\a',
@@ -60,9 +58,8 @@ static const bool ascii_printable_chars[] = {
60
58
  };
61
59
 
62
60
  static inline bool
63
- char_is_ascii_printable(const char c) {
64
- unsigned char v = (unsigned char) c;
65
- return (v < 0x80) && ascii_printable_chars[v];
61
+ char_is_ascii_printable(const uint8_t b) {
62
+ return (b < 0x80) && ascii_printable_chars[b];
66
63
  }
67
64
 
68
65
  /******************************************************************************/
@@ -72,37 +69,39 @@ char_is_ascii_printable(const char c) {
72
69
  // Scan the 1-3 digits of octal into the value. Returns the number of digits
73
70
  // scanned.
74
71
  static inline size_t
75
- unescape_octal(const char *backslash, unsigned char *value) {
76
- *value = (unsigned char) (backslash[1] - '0');
77
- if (!yp_char_is_octal_digit(backslash[2])) {
72
+ unescape_octal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
73
+ *value = (uint8_t) (backslash[1] - '0');
74
+ if (backslash + 2 >= end || !yp_char_is_octal_digit(backslash[2])) {
78
75
  return 2;
79
76
  }
80
-
81
- *value = (unsigned char) ((*value << 3) | (backslash[2] - '0'));
82
- if (!yp_char_is_octal_digit(backslash[3])) {
77
+ *value = (uint8_t) ((*value << 3) | (backslash[2] - '0'));
78
+ if (backslash + 3 >= end || !yp_char_is_octal_digit(backslash[3])) {
83
79
  return 3;
84
80
  }
85
-
86
- *value = (unsigned char) ((*value << 3) | (backslash[3] - '0'));
81
+ *value = (uint8_t) ((*value << 3) | (backslash[3] - '0'));
87
82
  return 4;
88
83
  }
89
84
 
90
85
  // Convert a hexadecimal digit into its equivalent value.
91
- static inline unsigned char
92
- unescape_hexadecimal_digit(const char value) {
93
- return (unsigned char) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
86
+ static inline uint8_t
87
+ unescape_hexadecimal_digit(const uint8_t value) {
88
+ return (uint8_t) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
94
89
  }
95
90
 
96
91
  // Scan the 1-2 digits of hexadecimal into the value. Returns the number of
97
92
  // digits scanned.
98
93
  static inline size_t
99
- unescape_hexadecimal(const char *backslash, unsigned char *value) {
94
+ unescape_hexadecimal(const uint8_t *backslash, uint8_t *value, const uint8_t *end, yp_list_t *error_list) {
95
+ *value = 0;
96
+ if (backslash + 2 >= end || !yp_char_is_hexadecimal_digit(backslash[2])) {
97
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid hex escape.");
98
+ return 2;
99
+ }
100
100
  *value = unescape_hexadecimal_digit(backslash[2]);
101
- if (!yp_char_is_hexadecimal_digit(backslash[3])) {
101
+ if (backslash + 3 >= end || !yp_char_is_hexadecimal_digit(backslash[3])) {
102
102
  return 3;
103
103
  }
104
-
105
- *value = (unsigned char) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
104
+ *value = (uint8_t) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
106
105
  return 4;
107
106
  }
108
107
 
@@ -110,7 +109,7 @@ unescape_hexadecimal(const char *backslash, unsigned char *value) {
110
109
  // digits scanned. This function assumes that the characters have already been
111
110
  // validated.
112
111
  static inline void
113
- unescape_unicode(const char *string, size_t length, uint32_t *value) {
112
+ unescape_unicode(const uint8_t *string, size_t length, uint32_t *value) {
114
113
  *value = 0;
115
114
  for (size_t index = 0; index < length; index++) {
116
115
  if (index != 0) *value <<= 4;
@@ -122,27 +121,25 @@ unescape_unicode(const char *string, size_t length, uint32_t *value) {
122
121
  // 32-bit value to write. Writes the UTF-8 representation of the value to the
123
122
  // string and returns the number of bytes written.
124
123
  static inline size_t
125
- unescape_unicode_write(char *dest, uint32_t value, const char *start, const char *end, yp_list_t *error_list) {
126
- unsigned char *bytes = (unsigned char *) dest;
127
-
124
+ unescape_unicode_write(uint8_t *dest, uint32_t value, const uint8_t *start, const uint8_t *end, yp_list_t *error_list) {
128
125
  if (value <= 0x7F) {
129
126
  // 0xxxxxxx
130
- bytes[0] = (unsigned char) value;
127
+ dest[0] = (uint8_t) value;
131
128
  return 1;
132
129
  }
133
130
 
134
131
  if (value <= 0x7FF) {
135
132
  // 110xxxxx 10xxxxxx
136
- bytes[0] = (unsigned char) (0xC0 | (value >> 6));
137
- bytes[1] = (unsigned char) (0x80 | (value & 0x3F));
133
+ dest[0] = (uint8_t) (0xC0 | (value >> 6));
134
+ dest[1] = (uint8_t) (0x80 | (value & 0x3F));
138
135
  return 2;
139
136
  }
140
137
 
141
138
  if (value <= 0xFFFF) {
142
139
  // 1110xxxx 10xxxxxx 10xxxxxx
143
- bytes[0] = (unsigned char) (0xE0 | (value >> 12));
144
- bytes[1] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
145
- bytes[2] = (unsigned char) (0x80 | (value & 0x3F));
140
+ dest[0] = (uint8_t) (0xE0 | (value >> 12));
141
+ dest[1] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
142
+ dest[2] = (uint8_t) (0x80 | (value & 0x3F));
146
143
  return 3;
147
144
  }
148
145
 
@@ -150,20 +147,20 @@ unescape_unicode_write(char *dest, uint32_t value, const char *start, const char
150
147
  // the input is invalid.
151
148
  if (value <= 0x10FFFF) {
152
149
  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
153
- bytes[0] = (unsigned char) (0xF0 | (value >> 18));
154
- bytes[1] = (unsigned char) (0x80 | ((value >> 12) & 0x3F));
155
- bytes[2] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
156
- bytes[3] = (unsigned char) (0x80 | (value & 0x3F));
150
+ dest[0] = (uint8_t) (0xF0 | (value >> 18));
151
+ dest[1] = (uint8_t) (0x80 | ((value >> 12) & 0x3F));
152
+ dest[2] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
153
+ dest[3] = (uint8_t) (0x80 | (value & 0x3F));
157
154
  return 4;
158
155
  }
159
156
 
160
157
  // If we get here, then the value is too big. This is an error, but we don't
161
158
  // want to just crash, so instead we'll add an error to the error list and put
162
159
  // in a replacement character instead.
163
- yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
164
- bytes[0] = 0xEF;
165
- bytes[1] = 0xBF;
166
- bytes[2] = 0xBD;
160
+ if (error_list) yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
161
+ dest[0] = 0xEF;
162
+ dest[1] = 0xBF;
163
+ dest[2] = 0xBD;
167
164
  return 3;
168
165
  }
169
166
 
@@ -175,24 +172,30 @@ typedef enum {
175
172
  } yp_unescape_flag_t;
176
173
 
177
174
  // Unescape a single character value based on the given flags.
178
- static inline unsigned char
179
- unescape_char(const unsigned char value, const unsigned char flags) {
180
- unsigned char unescaped = value;
181
-
175
+ static inline uint8_t
176
+ unescape_char(uint8_t value, const uint8_t flags) {
182
177
  if (flags & YP_UNESCAPE_FLAG_CONTROL) {
183
- unescaped &= 0x1f;
178
+ value &= 0x1f;
184
179
  }
185
180
 
186
181
  if (flags & YP_UNESCAPE_FLAG_META) {
187
- unescaped |= 0x80;
182
+ value |= 0x80;
188
183
  }
189
184
 
190
- return unescaped;
185
+ return value;
191
186
  }
192
187
 
193
188
  // Read a specific escape sequence into the given destination.
194
- static const char *
195
- unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backslash, const char *end, const unsigned char flags, bool write_to_str) {
189
+ static const uint8_t *
190
+ unescape(
191
+ yp_parser_t *parser,
192
+ uint8_t *dest,
193
+ size_t *dest_length,
194
+ const uint8_t *backslash,
195
+ const uint8_t *end,
196
+ const uint8_t flags,
197
+ yp_list_t *error_list
198
+ ) {
196
199
  switch (backslash[1]) {
197
200
  case 'a':
198
201
  case 'b':
@@ -203,28 +206,28 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
203
206
  case 's':
204
207
  case 't':
205
208
  case 'v':
206
- if (write_to_str) {
207
- dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[1]], flags);
209
+ if (dest) {
210
+ dest[(*dest_length)++] = unescape_char(unescape_chars[backslash[1]], flags);
208
211
  }
209
212
  return backslash + 2;
210
213
  // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
211
214
  case '0': case '1': case '2': case '3': case '4':
212
215
  case '5': case '6': case '7': case '8': case '9': {
213
- unsigned char value;
214
- const char *cursor = backslash + unescape_octal(backslash, &value);
216
+ uint8_t value;
217
+ const uint8_t *cursor = backslash + unescape_octal(backslash, &value, end);
215
218
 
216
- if (write_to_str) {
217
- dest[(*dest_length)++] = (char) unescape_char(value, flags);
219
+ if (dest) {
220
+ dest[(*dest_length)++] = unescape_char(value, flags);
218
221
  }
219
222
  return cursor;
220
223
  }
221
224
  // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
222
225
  case 'x': {
223
- unsigned char value;
224
- const char *cursor = backslash + unescape_hexadecimal(backslash, &value);
226
+ uint8_t value;
227
+ const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value, end, error_list);
225
228
 
226
- if (write_to_str) {
227
- dest[(*dest_length)++] = (char) unescape_char(value, flags);
229
+ if (dest) {
230
+ dest[(*dest_length)++] = unescape_char(value, flags);
228
231
  }
229
232
  return cursor;
230
233
  }
@@ -232,28 +235,28 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
232
235
  // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
233
236
  case 'u': {
234
237
  if ((flags & YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) {
235
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
238
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
236
239
  return backslash + 2;
237
240
  }
238
241
 
239
242
  if ((backslash + 3) < end && backslash[2] == '{') {
240
- const char *unicode_cursor = backslash + 3;
241
- const char *extra_codepoints_start = NULL;
243
+ const uint8_t *unicode_cursor = backslash + 3;
244
+ const uint8_t *extra_codepoints_start = NULL;
242
245
  int codepoints_count = 0;
243
246
 
244
247
  unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
245
248
 
246
- while ((*unicode_cursor != '}') && (unicode_cursor < end)) {
247
- const char *unicode_start = unicode_cursor;
249
+ while ((unicode_cursor < end) && (*unicode_cursor != '}')) {
250
+ const uint8_t *unicode_start = unicode_cursor;
248
251
  size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
249
252
 
250
253
  // \u{nnnn} character literal allows only 1-6 hexadecimal digits
251
- if (hexadecimal_length > 6)
252
- yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
253
-
254
+ if (hexadecimal_length > 6) {
255
+ if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
256
+ }
254
257
  // there are not hexadecimal characters
255
- if (hexadecimal_length == 0) {
256
- yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
258
+ else if (hexadecimal_length == 0) {
259
+ if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
257
260
  return unicode_cursor;
258
261
  }
259
262
 
@@ -265,31 +268,37 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
265
268
 
266
269
  uint32_t value;
267
270
  unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
268
- if (write_to_str) {
269
- *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, &parser->error_list);
271
+ if (dest) {
272
+ *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
270
273
  }
271
274
 
272
275
  unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
273
276
  }
274
277
 
275
278
  // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
276
- if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
277
- yp_diagnostic_list_append(&parser->error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
279
+ if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1) {
280
+ if (error_list) yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
281
+ }
278
282
 
279
- return unicode_cursor + 1;
280
- }
283
+ if (unicode_cursor < end && *unicode_cursor == '}') {
284
+ unicode_cursor++;
285
+ } else {
286
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, unicode_cursor, "invalid Unicode escape.");
287
+ }
281
288
 
282
- if ((backslash + 2) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
289
+ return unicode_cursor;
290
+ }
291
+ else if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
283
292
  uint32_t value;
284
293
  unescape_unicode(backslash + 2, 4, &value);
285
294
 
286
- if (write_to_str) {
287
- *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, &parser->error_list);
295
+ if (dest) {
296
+ *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
288
297
  }
289
298
  return backslash + 6;
290
299
  }
291
300
 
292
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
301
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
293
302
  return backslash + 2;
294
303
  }
295
304
  // \c\M-x meta control character, where x is an ASCII printable character
@@ -297,31 +306,31 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
297
306
  // \cx control character, where x is an ASCII printable character
298
307
  case 'c':
299
308
  if (backslash + 2 >= end) {
300
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
309
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
301
310
  return end;
302
311
  }
303
312
 
304
313
  if (flags & YP_UNESCAPE_FLAG_CONTROL) {
305
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
314
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
306
315
  return backslash + 2;
307
316
  }
308
317
 
309
318
  switch (backslash[2]) {
310
319
  case '\\':
311
- return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
320
+ return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
312
321
  case '?':
313
- if (write_to_str) {
314
- dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
322
+ if (dest) {
323
+ dest[(*dest_length)++] = unescape_char(0x7f, flags);
315
324
  }
316
325
  return backslash + 3;
317
326
  default: {
318
327
  if (!char_is_ascii_printable(backslash[2])) {
319
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
328
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
320
329
  return backslash + 2;
321
330
  }
322
331
 
323
- if (write_to_str) {
324
- dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
332
+ if (dest) {
333
+ dest[(*dest_length)++] = unescape_char(backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
325
334
  }
326
335
  return backslash + 3;
327
336
  }
@@ -330,36 +339,36 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
330
339
  // \C-? delete, ASCII 7Fh (DEL)
331
340
  case 'C':
332
341
  if (backslash + 3 >= end) {
333
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
342
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
334
343
  return end;
335
344
  }
336
345
 
337
346
  if (flags & YP_UNESCAPE_FLAG_CONTROL) {
338
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
347
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
339
348
  return backslash + 2;
340
349
  }
341
350
 
342
351
  if (backslash[2] != '-') {
343
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
352
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
344
353
  return backslash + 2;
345
354
  }
346
355
 
347
356
  switch (backslash[3]) {
348
357
  case '\\':
349
- return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
358
+ return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
350
359
  case '?':
351
- if (write_to_str) {
352
- dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
360
+ if (dest) {
361
+ dest[(*dest_length)++] = unescape_char(0x7f, flags);
353
362
  }
354
363
  return backslash + 4;
355
364
  default:
356
365
  if (!char_is_ascii_printable(backslash[3])) {
357
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid control escape sequence");
366
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
358
367
  return backslash + 2;
359
368
  }
360
369
 
361
- if (write_to_str) {
362
- dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
370
+ if (dest) {
371
+ dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
363
372
  }
364
373
  return backslash + 4;
365
374
  }
@@ -368,32 +377,32 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
368
377
  // \M-x meta character, where x is an ASCII printable character
369
378
  case 'M': {
370
379
  if (backslash + 3 >= end) {
371
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
380
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
372
381
  return end;
373
382
  }
374
383
 
375
384
  if (flags & YP_UNESCAPE_FLAG_META) {
376
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
385
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
377
386
  return backslash + 2;
378
387
  }
379
388
 
380
389
  if (backslash[2] != '-') {
381
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence");
390
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
382
391
  return backslash + 2;
383
392
  }
384
393
 
385
394
  if (backslash[3] == '\\') {
386
- return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, write_to_str);
395
+ return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, error_list);
387
396
  }
388
397
 
389
398
  if (char_is_ascii_printable(backslash[3])) {
390
- if (write_to_str) {
391
- dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_META);
399
+ if (dest) {
400
+ dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_META);
392
401
  }
393
402
  return backslash + 4;
394
403
  }
395
404
 
396
- yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence");
405
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
397
406
  return backslash + 3;
398
407
  }
399
408
  // \n
@@ -409,7 +418,7 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
409
418
  default: {
410
419
  size_t width = yp_char_width(parser, backslash + 1, end);
411
420
 
412
- if (write_to_str) {
421
+ if (dest) {
413
422
  memcpy(dest + *dest_length, backslash + 1, width);
414
423
  *dest_length += width;
415
424
  }
@@ -447,14 +456,14 @@ unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backs
447
456
  // \c\M-x same as above
448
457
  // \c? or \C-? delete, ASCII 7Fh (DEL)
449
458
  //
450
- YP_EXPORTED_FUNCTION void
451
- yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
459
+ static void
460
+ yp_unescape_manipulate_string_or_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
452
461
  if (unescape_type == YP_UNESCAPE_NONE) {
453
462
  // If we're not unescaping then we can reference the source directly.
454
463
  return;
455
464
  }
456
465
 
457
- const char *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);
466
+ const uint8_t *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);
458
467
 
459
468
  if (backslash == NULL) {
460
469
  // Here there are no escapes, so we can reference the source directly.
@@ -463,21 +472,21 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
463
472
 
464
473
  // Here we have found an escape character, so we need to handle all escapes
465
474
  // within the string.
466
- char *allocated = malloc(string->length);
475
+ uint8_t *allocated = malloc(string->length);
467
476
  if (allocated == NULL) {
468
477
  yp_diagnostic_list_append(&parser->error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
469
478
  return;
470
479
  }
471
480
 
472
481
  // This is the memory address where we're putting the unescaped string.
473
- char *dest = allocated;
482
+ uint8_t *dest = allocated;
474
483
  size_t dest_length = 0;
475
484
 
476
485
  // This is the current position in the source string that we're looking at.
477
486
  // It's going to move along behind the backslash so that we can copy each
478
487
  // segment of the string that doesn't contain an escape.
479
- const char *cursor = string->source;
480
- const char *end = string->source + string->length;
488
+ const uint8_t *cursor = string->source;
489
+ const uint8_t *end = string->source + string->length;
481
490
 
482
491
  // For each escape found in the source string, we will handle it and update
483
492
  // the moving cursor->backslash window.
@@ -496,7 +505,7 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
496
505
  switch (backslash[1]) {
497
506
  case '\\':
498
507
  case '\'':
499
- dest[dest_length++] = (char) unescape_chars[(unsigned char) backslash[1]];
508
+ dest[dest_length++] = unescape_chars[backslash[1]];
500
509
  cursor = backslash + 2;
501
510
  break;
502
511
  default:
@@ -510,7 +519,13 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
510
519
  // This is the only type of unescaping left. In this case we need to
511
520
  // handle all of the different unescapes.
512
521
  assert(unescape_type == YP_UNESCAPE_ALL);
513
- cursor = unescape(parser, dest, &dest_length, backslash, end, YP_UNESCAPE_FLAG_NONE, true);
522
+
523
+ uint8_t flags = YP_UNESCAPE_FLAG_NONE;
524
+ if (expect_single_codepoint) {
525
+ flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
526
+ }
527
+
528
+ cursor = unescape(parser, dest, &dest_length, backslash, end, flags, &parser->error_list);
514
529
  break;
515
530
  }
516
531
 
@@ -538,13 +553,27 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
538
553
  yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor)));
539
554
  }
540
555
 
556
+ YP_EXPORTED_FUNCTION void
557
+ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
558
+ yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, false);
559
+ }
560
+
561
+ void
562
+ yp_unescape_manipulate_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
563
+ yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, true);
564
+ }
565
+
541
566
  // This function is similar to yp_unescape_manipulate_string, except it doesn't
542
567
  // actually perform any string manipulations. Instead, it calculates how long
543
568
  // the unescaped character is, and returns that value
544
569
  size_t
545
- yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
570
+ yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
546
571
  assert(unescape_type != YP_UNESCAPE_NONE);
547
572
 
573
+ if (backslash + 1 >= parser->end) {
574
+ return 0;
575
+ }
576
+
548
577
  switch (backslash[1]) {
549
578
  case '\\':
550
579
  case '\'':
@@ -558,11 +587,12 @@ yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_
558
587
  // handle all of the different unescapes.
559
588
  assert(unescape_type == YP_UNESCAPE_ALL);
560
589
 
561
- unsigned char flags = YP_UNESCAPE_FLAG_NONE;
562
- if (expect_single_codepoint)
590
+ uint8_t flags = YP_UNESCAPE_FLAG_NONE;
591
+ if (expect_single_codepoint) {
563
592
  flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
593
+ }
564
594
 
565
- const char *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, false);
595
+ const uint8_t *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, NULL);
566
596
  assert(cursor > backslash);
567
597
 
568
598
  return (size_t) (cursor - backslash);
@@ -574,7 +604,7 @@ yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_
574
604
  // string, a type of unescaping, and a pointer to a result string. It returns a
575
605
  // boolean indicating whether or not the unescaping was successful.
576
606
  YP_EXPORTED_FUNCTION bool
577
- yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
607
+ yp_unescape_string(const uint8_t *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
578
608
  yp_parser_t parser;
579
609
  yp_parser_init(&parser, start, length, NULL);
580
610
 
data/src/util/yp_buffer.c CHANGED
@@ -63,8 +63,13 @@ yp_buffer_append_zeroes(yp_buffer_t *buffer, size_t length) {
63
63
  // Append a string to the buffer.
64
64
  void
65
65
  yp_buffer_append_str(yp_buffer_t *buffer, const char *value, size_t length) {
66
- const void *source = value;
67
- yp_buffer_append(buffer, source, length);
66
+ yp_buffer_append(buffer, value, length);
67
+ }
68
+
69
+ // Append a list of bytes to the buffer.
70
+ void
71
+ yp_buffer_append_bytes(yp_buffer_t *buffer, const uint8_t *value, size_t length) {
72
+ yp_buffer_append(buffer, (const char *) value, length);
68
73
  }
69
74
 
70
75
  // Append a single byte to the buffer.