yarp 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +48 -1
  3. data/Makefile +5 -1
  4. data/README.md +4 -3
  5. data/config.yml +461 -150
  6. data/docs/configuration.md +1 -0
  7. data/docs/encoding.md +5 -5
  8. data/docs/ruby_api.md +2 -0
  9. data/docs/serialization.md +3 -3
  10. data/docs/testing.md +2 -2
  11. data/ext/yarp/api_node.c +810 -199
  12. data/ext/yarp/extension.c +94 -31
  13. data/ext/yarp/extension.h +2 -2
  14. data/include/yarp/ast.h +653 -150
  15. data/include/yarp/defines.h +2 -1
  16. data/include/yarp/diagnostic.h +3 -3
  17. data/include/yarp/enc/yp_encoding.h +10 -10
  18. data/include/yarp/node.h +10 -0
  19. data/include/yarp/parser.h +19 -19
  20. data/include/yarp/regexp.h +1 -1
  21. data/include/yarp/unescape.h +7 -5
  22. data/include/yarp/util/yp_buffer.h +3 -0
  23. data/include/yarp/util/yp_char.h +16 -16
  24. data/include/yarp/util/yp_constant_pool.h +2 -2
  25. data/include/yarp/util/yp_newline_list.h +7 -4
  26. data/include/yarp/util/yp_string.h +4 -4
  27. data/include/yarp/util/yp_string_list.h +0 -3
  28. data/include/yarp/util/yp_strpbrk.h +1 -1
  29. data/include/yarp/version.h +2 -2
  30. data/include/yarp.h +14 -3
  31. data/lib/yarp/desugar_visitor.rb +204 -0
  32. data/lib/yarp/ffi.rb +27 -1
  33. data/lib/yarp/lex_compat.rb +93 -25
  34. data/lib/yarp/mutation_visitor.rb +683 -0
  35. data/lib/yarp/node.rb +3121 -597
  36. data/lib/yarp/serialize.rb +198 -126
  37. data/lib/yarp.rb +53 -7
  38. data/src/diagnostic.c +1 -1
  39. data/src/enc/yp_big5.c +15 -42
  40. data/src/enc/yp_euc_jp.c +16 -43
  41. data/src/enc/yp_gbk.c +19 -46
  42. data/src/enc/yp_shift_jis.c +16 -43
  43. data/src/enc/yp_tables.c +36 -38
  44. data/src/enc/yp_unicode.c +20 -25
  45. data/src/enc/yp_windows_31j.c +16 -43
  46. data/src/node.c +1444 -836
  47. data/src/prettyprint.c +324 -103
  48. data/src/regexp.c +21 -21
  49. data/src/serialize.c +429 -276
  50. data/src/token_type.c +2 -2
  51. data/src/unescape.c +184 -136
  52. data/src/util/yp_buffer.c +7 -2
  53. data/src/util/yp_char.c +34 -34
  54. data/src/util/yp_constant_pool.c +4 -4
  55. data/src/util/yp_memchr.c +1 -1
  56. data/src/util/yp_newline_list.c +14 -3
  57. data/src/util/yp_string.c +22 -20
  58. data/src/util/yp_string_list.c +0 -6
  59. data/src/util/yp_strncasecmp.c +3 -6
  60. data/src/util/yp_strpbrk.c +8 -8
  61. data/src/yarp.c +1504 -615
  62. data/yarp.gemspec +3 -1
  63. metadata +4 -2
data/src/token_type.c CHANGED
@@ -1,6 +1,6 @@
1
1
  /******************************************************************************/
2
- /* This file is generated by the bin/template script and should not be */
3
- /* modified manually. See */
2
+ /* This file is generated by the templates/template.rb script and should not */
3
+ /* be modified manually. See */
4
4
  /* templates/src/token_type.c.erb */
5
5
  /* if you are looking to modify the */
6
6
  /* template */
data/src/unescape.c CHANGED
@@ -5,21 +5,33 @@
5
5
  /******************************************************************************/
6
6
 
7
7
  static inline bool
8
- yp_char_is_hexadecimal_digits(const char *c, size_t length) {
8
+ yp_char_is_hexadecimal_digits(const uint8_t *string, size_t length) {
9
9
  for (size_t index = 0; index < length; index++) {
10
- if (!yp_char_is_hexadecimal_digit(c[index])) {
10
+ if (!yp_char_is_hexadecimal_digit(string[index])) {
11
11
  return false;
12
12
  }
13
13
  }
14
14
  return true;
15
15
  }
16
16
 
17
+ // We don't call the char_width function unless we have to because it's
18
+ // expensive to go through the indirection of the function pointer. Instead we
19
+ // provide a fast path that will check if we can just return 1.
20
+ static inline size_t
21
+ yp_char_width(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
22
+ if (parser->encoding_changed || (*start >= 0x80)) {
23
+ return parser->encoding.char_width(start, end - start);
24
+ } else {
25
+ return 1;
26
+ }
27
+ }
28
+
17
29
  /******************************************************************************/
18
30
  /* Lookup tables for characters */
19
31
  /******************************************************************************/
20
32
 
21
33
  // This is a lookup table for unescapes that only take up a single character.
22
- static const unsigned char unescape_chars[] = {
34
+ static const uint8_t unescape_chars[] = {
23
35
  ['\''] = '\'',
24
36
  ['\\'] = '\\',
25
37
  ['a'] = '\a',
@@ -46,9 +58,8 @@ static const bool ascii_printable_chars[] = {
46
58
  };
47
59
 
48
60
  static inline bool
49
- char_is_ascii_printable(const char c) {
50
- unsigned char v = (unsigned char) c;
51
- return (v < 0x80) && ascii_printable_chars[v];
61
+ char_is_ascii_printable(const uint8_t b) {
62
+ return (b < 0x80) && ascii_printable_chars[b];
52
63
  }
53
64
 
54
65
  /******************************************************************************/
@@ -58,37 +69,39 @@ char_is_ascii_printable(const char c) {
58
69
  // Scan the 1-3 digits of octal into the value. Returns the number of digits
59
70
  // scanned.
60
71
  static inline size_t
61
- unescape_octal(const char *backslash, unsigned char *value) {
62
- *value = (unsigned char) (backslash[1] - '0');
63
- if (!yp_char_is_octal_digit(backslash[2])) {
72
+ unescape_octal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
73
+ *value = (uint8_t) (backslash[1] - '0');
74
+ if (backslash + 2 >= end || !yp_char_is_octal_digit(backslash[2])) {
64
75
  return 2;
65
76
  }
66
-
67
- *value = (unsigned char) ((*value << 3) | (backslash[2] - '0'));
68
- if (!yp_char_is_octal_digit(backslash[3])) {
77
+ *value = (uint8_t) ((*value << 3) | (backslash[2] - '0'));
78
+ if (backslash + 3 >= end || !yp_char_is_octal_digit(backslash[3])) {
69
79
  return 3;
70
80
  }
71
-
72
- *value = (unsigned char) ((*value << 3) | (backslash[3] - '0'));
81
+ *value = (uint8_t) ((*value << 3) | (backslash[3] - '0'));
73
82
  return 4;
74
83
  }
75
84
 
76
85
  // Convert a hexadecimal digit into its equivalent value.
77
- static inline unsigned char
78
- unescape_hexadecimal_digit(const char value) {
79
- return (unsigned char) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
86
+ static inline uint8_t
87
+ unescape_hexadecimal_digit(const uint8_t value) {
88
+ return (uint8_t) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
80
89
  }
81
90
 
82
91
  // Scan the 1-2 digits of hexadecimal into the value. Returns the number of
83
92
  // digits scanned.
84
93
  static inline size_t
85
- unescape_hexadecimal(const char *backslash, unsigned char *value) {
94
+ unescape_hexadecimal(const uint8_t *backslash, uint8_t *value, const uint8_t *end, yp_list_t *error_list) {
95
+ *value = 0;
96
+ if (backslash + 2 >= end || !yp_char_is_hexadecimal_digit(backslash[2])) {
97
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid hex escape.");
98
+ return 2;
99
+ }
86
100
  *value = unescape_hexadecimal_digit(backslash[2]);
87
- if (!yp_char_is_hexadecimal_digit(backslash[3])) {
101
+ if (backslash + 3 >= end || !yp_char_is_hexadecimal_digit(backslash[3])) {
88
102
  return 3;
89
103
  }
90
-
91
- *value = (unsigned char) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
104
+ *value = (uint8_t) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
92
105
  return 4;
93
106
  }
94
107
 
@@ -96,7 +109,7 @@ unescape_hexadecimal(const char *backslash, unsigned char *value) {
96
109
  // digits scanned. This function assumes that the characters have already been
97
110
  // validated.
98
111
  static inline void
99
- unescape_unicode(const char *string, size_t length, uint32_t *value) {
112
+ unescape_unicode(const uint8_t *string, size_t length, uint32_t *value) {
100
113
  *value = 0;
101
114
  for (size_t index = 0; index < length; index++) {
102
115
  if (index != 0) *value <<= 4;
@@ -108,27 +121,25 @@ unescape_unicode(const char *string, size_t length, uint32_t *value) {
108
121
  // 32-bit value to write. Writes the UTF-8 representation of the value to the
109
122
  // string and returns the number of bytes written.
110
123
  static inline size_t
111
- unescape_unicode_write(char *dest, uint32_t value, const char *start, const char *end, yp_list_t *error_list) {
112
- unsigned char *bytes = (unsigned char *) dest;
113
-
124
+ unescape_unicode_write(uint8_t *dest, uint32_t value, const uint8_t *start, const uint8_t *end, yp_list_t *error_list) {
114
125
  if (value <= 0x7F) {
115
126
  // 0xxxxxxx
116
- bytes[0] = (unsigned char) value;
127
+ dest[0] = (uint8_t) value;
117
128
  return 1;
118
129
  }
119
130
 
120
131
  if (value <= 0x7FF) {
121
132
  // 110xxxxx 10xxxxxx
122
- bytes[0] = (unsigned char) (0xC0 | (value >> 6));
123
- bytes[1] = (unsigned char) (0x80 | (value & 0x3F));
133
+ dest[0] = (uint8_t) (0xC0 | (value >> 6));
134
+ dest[1] = (uint8_t) (0x80 | (value & 0x3F));
124
135
  return 2;
125
136
  }
126
137
 
127
138
  if (value <= 0xFFFF) {
128
139
  // 1110xxxx 10xxxxxx 10xxxxxx
129
- bytes[0] = (unsigned char) (0xE0 | (value >> 12));
130
- bytes[1] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
131
- bytes[2] = (unsigned char) (0x80 | (value & 0x3F));
140
+ dest[0] = (uint8_t) (0xE0 | (value >> 12));
141
+ dest[1] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
142
+ dest[2] = (uint8_t) (0x80 | (value & 0x3F));
132
143
  return 3;
133
144
  }
134
145
 
@@ -136,20 +147,20 @@ unescape_unicode_write(char *dest, uint32_t value, const char *start, const char
136
147
  // the input is invalid.
137
148
  if (value <= 0x10FFFF) {
138
149
  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
139
- bytes[0] = (unsigned char) (0xF0 | (value >> 18));
140
- bytes[1] = (unsigned char) (0x80 | ((value >> 12) & 0x3F));
141
- bytes[2] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
142
- bytes[3] = (unsigned char) (0x80 | (value & 0x3F));
150
+ dest[0] = (uint8_t) (0xF0 | (value >> 18));
151
+ dest[1] = (uint8_t) (0x80 | ((value >> 12) & 0x3F));
152
+ dest[2] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
153
+ dest[3] = (uint8_t) (0x80 | (value & 0x3F));
143
154
  return 4;
144
155
  }
145
156
 
146
157
  // If we get here, then the value is too big. This is an error, but we don't
147
158
  // want to just crash, so instead we'll add an error to the error list and put
148
159
  // in a replacement character instead.
149
- yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
150
- bytes[0] = 0xEF;
151
- bytes[1] = 0xBF;
152
- bytes[2] = 0xBD;
160
+ if (error_list) yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
161
+ dest[0] = 0xEF;
162
+ dest[1] = 0xBF;
163
+ dest[2] = 0xBD;
153
164
  return 3;
154
165
  }
155
166
 
@@ -161,24 +172,30 @@ typedef enum {
161
172
  } yp_unescape_flag_t;
162
173
 
163
174
  // Unescape a single character value based on the given flags.
164
- static inline unsigned char
165
- unescape_char(const unsigned char value, const unsigned char flags) {
166
- unsigned char unescaped = value;
167
-
175
+ static inline uint8_t
176
+ unescape_char(uint8_t value, const uint8_t flags) {
168
177
  if (flags & YP_UNESCAPE_FLAG_CONTROL) {
169
- unescaped &= 0x1f;
178
+ value &= 0x1f;
170
179
  }
171
180
 
172
181
  if (flags & YP_UNESCAPE_FLAG_META) {
173
- unescaped |= 0x80;
182
+ value |= 0x80;
174
183
  }
175
184
 
176
- return unescaped;
185
+ return value;
177
186
  }
178
187
 
179
188
  // Read a specific escape sequence into the given destination.
180
- static const char *
181
- unescape(char *dest, size_t *dest_length, const char *backslash, const char *end, yp_list_t *error_list, const unsigned char flags, bool write_to_str) {
189
+ static const uint8_t *
190
+ unescape(
191
+ yp_parser_t *parser,
192
+ uint8_t *dest,
193
+ size_t *dest_length,
194
+ const uint8_t *backslash,
195
+ const uint8_t *end,
196
+ const uint8_t flags,
197
+ yp_list_t *error_list
198
+ ) {
182
199
  switch (backslash[1]) {
183
200
  case 'a':
184
201
  case 'b':
@@ -189,28 +206,28 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
189
206
  case 's':
190
207
  case 't':
191
208
  case 'v':
192
- if (write_to_str) {
193
- dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[1]], flags);
209
+ if (dest) {
210
+ dest[(*dest_length)++] = unescape_char(unescape_chars[backslash[1]], flags);
194
211
  }
195
212
  return backslash + 2;
196
213
  // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
197
214
  case '0': case '1': case '2': case '3': case '4':
198
215
  case '5': case '6': case '7': case '8': case '9': {
199
- unsigned char value;
200
- const char *cursor = backslash + unescape_octal(backslash, &value);
216
+ uint8_t value;
217
+ const uint8_t *cursor = backslash + unescape_octal(backslash, &value, end);
201
218
 
202
- if (write_to_str) {
203
- dest[(*dest_length)++] = (char) unescape_char(value, flags);
219
+ if (dest) {
220
+ dest[(*dest_length)++] = unescape_char(value, flags);
204
221
  }
205
222
  return cursor;
206
223
  }
207
224
  // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
208
225
  case 'x': {
209
- unsigned char value;
210
- const char *cursor = backslash + unescape_hexadecimal(backslash, &value);
226
+ uint8_t value;
227
+ const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value, end, error_list);
211
228
 
212
- if (write_to_str) {
213
- dest[(*dest_length)++] = (char) unescape_char(value, flags);
229
+ if (dest) {
230
+ dest[(*dest_length)++] = unescape_char(value, flags);
214
231
  }
215
232
  return cursor;
216
233
  }
@@ -218,28 +235,28 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
218
235
  // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
219
236
  case 'u': {
220
237
  if ((flags & YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) {
221
- yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
238
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
222
239
  return backslash + 2;
223
240
  }
224
241
 
225
242
  if ((backslash + 3) < end && backslash[2] == '{') {
226
- const char *unicode_cursor = backslash + 3;
227
- const char *extra_codepoints_start = NULL;
243
+ const uint8_t *unicode_cursor = backslash + 3;
244
+ const uint8_t *extra_codepoints_start = NULL;
228
245
  int codepoints_count = 0;
229
246
 
230
247
  unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
231
248
 
232
- while ((*unicode_cursor != '}') && (unicode_cursor < end)) {
233
- const char *unicode_start = unicode_cursor;
249
+ while ((unicode_cursor < end) && (*unicode_cursor != '}')) {
250
+ const uint8_t *unicode_start = unicode_cursor;
234
251
  size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
235
252
 
236
253
  // \u{nnnn} character literal allows only 1-6 hexadecimal digits
237
- if (hexadecimal_length > 6)
238
- yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
239
-
254
+ if (hexadecimal_length > 6) {
255
+ if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
256
+ }
240
257
  // there are not hexadecimal characters
241
- if (hexadecimal_length == 0) {
242
- yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
258
+ else if (hexadecimal_length == 0) {
259
+ if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
243
260
  return unicode_cursor;
244
261
  }
245
262
 
@@ -251,7 +268,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
251
268
 
252
269
  uint32_t value;
253
270
  unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
254
- if (write_to_str) {
271
+ if (dest) {
255
272
  *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
256
273
  }
257
274
 
@@ -259,23 +276,29 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
259
276
  }
260
277
 
261
278
  // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
262
- if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
263
- yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
279
+ if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1) {
280
+ if (error_list) yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
281
+ }
264
282
 
265
- return unicode_cursor + 1;
266
- }
283
+ if (unicode_cursor < end && *unicode_cursor == '}') {
284
+ unicode_cursor++;
285
+ } else {
286
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, unicode_cursor, "invalid Unicode escape.");
287
+ }
267
288
 
268
- if ((backslash + 2) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
289
+ return unicode_cursor;
290
+ }
291
+ else if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
269
292
  uint32_t value;
270
293
  unescape_unicode(backslash + 2, 4, &value);
271
294
 
272
- if (write_to_str) {
295
+ if (dest) {
273
296
  *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
274
297
  }
275
298
  return backslash + 6;
276
299
  }
277
300
 
278
- yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
301
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
279
302
  return backslash + 2;
280
303
  }
281
304
  // \c\M-x meta control character, where x is an ASCII printable character
@@ -283,31 +306,31 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
283
306
  // \cx control character, where x is an ASCII printable character
284
307
  case 'c':
285
308
  if (backslash + 2 >= end) {
286
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
309
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
287
310
  return end;
288
311
  }
289
312
 
290
313
  if (flags & YP_UNESCAPE_FLAG_CONTROL) {
291
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
314
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
292
315
  return backslash + 2;
293
316
  }
294
317
 
295
318
  switch (backslash[2]) {
296
319
  case '\\':
297
- return unescape(dest, dest_length, backslash + 2, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
320
+ return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
298
321
  case '?':
299
- if (write_to_str) {
300
- dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
322
+ if (dest) {
323
+ dest[(*dest_length)++] = unescape_char(0x7f, flags);
301
324
  }
302
325
  return backslash + 3;
303
326
  default: {
304
327
  if (!char_is_ascii_printable(backslash[2])) {
305
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
328
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
306
329
  return backslash + 2;
307
330
  }
308
331
 
309
- if (write_to_str) {
310
- dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
332
+ if (dest) {
333
+ dest[(*dest_length)++] = unescape_char(backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
311
334
  }
312
335
  return backslash + 3;
313
336
  }
@@ -316,36 +339,36 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
316
339
  // \C-? delete, ASCII 7Fh (DEL)
317
340
  case 'C':
318
341
  if (backslash + 3 >= end) {
319
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
342
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
320
343
  return end;
321
344
  }
322
345
 
323
346
  if (flags & YP_UNESCAPE_FLAG_CONTROL) {
324
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
347
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
325
348
  return backslash + 2;
326
349
  }
327
350
 
328
351
  if (backslash[2] != '-') {
329
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
352
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
330
353
  return backslash + 2;
331
354
  }
332
355
 
333
356
  switch (backslash[3]) {
334
357
  case '\\':
335
- return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
358
+ return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
336
359
  case '?':
337
- if (write_to_str) {
338
- dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
360
+ if (dest) {
361
+ dest[(*dest_length)++] = unescape_char(0x7f, flags);
339
362
  }
340
363
  return backslash + 4;
341
364
  default:
342
365
  if (!char_is_ascii_printable(backslash[3])) {
343
- yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
366
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
344
367
  return backslash + 2;
345
368
  }
346
369
 
347
- if (write_to_str) {
348
- dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
370
+ if (dest) {
371
+ dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
349
372
  }
350
373
  return backslash + 4;
351
374
  }
@@ -354,32 +377,32 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
354
377
  // \M-x meta character, where x is an ASCII printable character
355
378
  case 'M': {
356
379
  if (backslash + 3 >= end) {
357
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
380
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
358
381
  return end;
359
382
  }
360
383
 
361
384
  if (flags & YP_UNESCAPE_FLAG_META) {
362
- yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
385
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
363
386
  return backslash + 2;
364
387
  }
365
388
 
366
389
  if (backslash[2] != '-') {
367
- yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
390
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
368
391
  return backslash + 2;
369
392
  }
370
393
 
371
394
  if (backslash[3] == '\\') {
372
- return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_META, write_to_str);
395
+ return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, error_list);
373
396
  }
374
397
 
375
398
  if (char_is_ascii_printable(backslash[3])) {
376
- if (write_to_str) {
377
- dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_META);
399
+ if (dest) {
400
+ dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_META);
378
401
  }
379
402
  return backslash + 4;
380
403
  }
381
404
 
382
- yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
405
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
383
406
  return backslash + 3;
384
407
  }
385
408
  // \n
@@ -390,14 +413,17 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
390
413
  if (backslash + 2 < end && backslash[2] == '\n') {
391
414
  return backslash + 3;
392
415
  }
393
-
394
- /* fallthrough */
416
+ /* fallthrough */
395
417
  // In this case we're escaping something that doesn't need escaping.
396
418
  default: {
397
- if (write_to_str) {
398
- dest[(*dest_length)++] = backslash[1];
419
+ size_t width = yp_char_width(parser, backslash + 1, end);
420
+
421
+ if (dest) {
422
+ memcpy(dest + *dest_length, backslash + 1, width);
423
+ *dest_length += width;
399
424
  }
400
- return backslash + 2;
425
+
426
+ return backslash + 1 + width;
401
427
  }
402
428
  }
403
429
  }
@@ -430,14 +456,14 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
430
456
  // \c\M-x same as above
431
457
  // \c? or \C-? delete, ASCII 7Fh (DEL)
432
458
  //
433
- YP_EXPORTED_FUNCTION void
434
- yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, yp_list_t *error_list) {
459
+ static void
460
+ yp_unescape_manipulate_string_or_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
435
461
  if (unescape_type == YP_UNESCAPE_NONE) {
436
462
  // If we're not unescaping then we can reference the source directly.
437
463
  return;
438
464
  }
439
465
 
440
- const char *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);
466
+ const uint8_t *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);
441
467
 
442
468
  if (backslash == NULL) {
443
469
  // Here there are no escapes, so we can reference the source directly.
@@ -446,21 +472,21 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
446
472
 
447
473
  // Here we have found an escape character, so we need to handle all escapes
448
474
  // within the string.
449
- char *allocated = malloc(string->length);
475
+ uint8_t *allocated = malloc(string->length);
450
476
  if (allocated == NULL) {
451
- yp_diagnostic_list_append(error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
477
+ yp_diagnostic_list_append(&parser->error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
452
478
  return;
453
479
  }
454
480
 
455
481
  // This is the memory address where we're putting the unescaped string.
456
- char *dest = allocated;
482
+ uint8_t *dest = allocated;
457
483
  size_t dest_length = 0;
458
484
 
459
485
  // This is the current position in the source string that we're looking at.
460
486
  // It's going to move along behind the backslash so that we can copy each
461
487
  // segment of the string that doesn't contain an escape.
462
- const char *cursor = string->source;
463
- const char *end = string->source + string->length;
488
+ const uint8_t *cursor = string->source;
489
+ const uint8_t *end = string->source + string->length;
464
490
 
465
491
  // For each escape found in the source string, we will handle it and update
466
492
  // the moving cursor->backslash window.
@@ -479,7 +505,7 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
479
505
  switch (backslash[1]) {
480
506
  case '\\':
481
507
  case '\'':
482
- dest[dest_length++] = (char) unescape_chars[(unsigned char) backslash[1]];
508
+ dest[dest_length++] = unescape_chars[backslash[1]];
483
509
  cursor = backslash + 2;
484
510
  break;
485
511
  default:
@@ -493,7 +519,13 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
493
519
  // This is the only type of unescaping left. In this case we need to
494
520
  // handle all of the different unescapes.
495
521
  assert(unescape_type == YP_UNESCAPE_ALL);
496
- cursor = unescape(dest, &dest_length, backslash, end, error_list, YP_UNESCAPE_FLAG_NONE, true);
522
+
523
+ uint8_t flags = YP_UNESCAPE_FLAG_NONE;
524
+ if (expect_single_codepoint) {
525
+ flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
526
+ }
527
+
528
+ cursor = unescape(parser, dest, &dest_length, backslash, end, flags, &parser->error_list);
497
529
  break;
498
530
  }
499
531
 
@@ -521,50 +553,66 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
521
553
  yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor)));
522
554
  }
523
555
 
524
- YP_EXPORTED_FUNCTION bool
525
- yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
526
- bool success;
527
-
528
- yp_parser_t parser;
529
- yp_parser_init(&parser, start, length, "");
530
-
531
- yp_list_t error_list = YP_LIST_EMPTY;
532
- yp_string_shared_init(result, start, start + length);
533
- yp_unescape_manipulate_string(&parser, result, unescape_type, &error_list);
534
- success = yp_list_empty_p(&error_list);
535
-
536
- yp_list_free(&error_list);
537
- yp_parser_free(&parser);
556
+ YP_EXPORTED_FUNCTION void
557
+ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
558
+ yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, false);
559
+ }
538
560
 
539
- return success;
561
+ void
562
+ yp_unescape_manipulate_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
563
+ yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, true);
540
564
  }
541
565
 
542
566
  // This function is similar to yp_unescape_manipulate_string, except it doesn't
543
567
  // actually perform any string manipulations. Instead, it calculates how long
544
568
  // the unescaped character is, and returns that value
545
- YP_EXPORTED_FUNCTION size_t
546
- yp_unescape_calculate_difference(const char *backslash, const char *end, yp_unescape_type_t unescape_type, bool expect_single_codepoint, yp_list_t *error_list) {
569
+ size_t
570
+ yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
547
571
  assert(unescape_type != YP_UNESCAPE_NONE);
548
572
 
573
+ if (backslash + 1 >= parser->end) {
574
+ return 0;
575
+ }
576
+
549
577
  switch (backslash[1]) {
550
578
  case '\\':
551
579
  case '\'':
552
580
  return 2;
553
581
  default: {
554
- if (unescape_type == YP_UNESCAPE_MINIMAL) return 2;
582
+ if (unescape_type == YP_UNESCAPE_MINIMAL) {
583
+ return 1 + yp_char_width(parser, backslash + 1, parser->end);
584
+ }
555
585
 
556
586
  // This is the only type of unescaping left. In this case we need to
557
587
  // handle all of the different unescapes.
558
588
  assert(unescape_type == YP_UNESCAPE_ALL);
559
589
 
560
- unsigned char flags = YP_UNESCAPE_FLAG_NONE;
561
- if (expect_single_codepoint)
590
+ uint8_t flags = YP_UNESCAPE_FLAG_NONE;
591
+ if (expect_single_codepoint) {
562
592
  flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
593
+ }
563
594
 
564
- const char *cursor = unescape(NULL, 0, backslash, end, error_list, flags, false);
595
+ const uint8_t *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, NULL);
565
596
  assert(cursor > backslash);
566
597
 
567
598
  return (size_t) (cursor - backslash);
568
599
  }
569
600
  }
570
601
  }
602
+
603
+ // This is one of the main entry points into the extension. It accepts a source
604
+ // string, a type of unescaping, and a pointer to a result string. It returns a
605
+ // boolean indicating whether or not the unescaping was successful.
606
+ YP_EXPORTED_FUNCTION bool
607
+ yp_unescape_string(const uint8_t *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
608
+ yp_parser_t parser;
609
+ yp_parser_init(&parser, start, length, NULL);
610
+
611
+ yp_string_shared_init(result, start, start + length);
612
+ yp_unescape_manipulate_string(&parser, result, unescape_type);
613
+
614
+ bool success = yp_list_empty_p(&parser.error_list);
615
+ yp_parser_free(&parser);
616
+
617
+ return success;
618
+ }