yarp 0.8.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +48 -1
  3. data/Makefile +5 -1
  4. data/README.md +4 -3
  5. data/config.yml +461 -150
  6. data/docs/configuration.md +1 -0
  7. data/docs/encoding.md +5 -5
  8. data/docs/ruby_api.md +2 -0
  9. data/docs/serialization.md +3 -3
  10. data/docs/testing.md +2 -2
  11. data/ext/yarp/api_node.c +810 -199
  12. data/ext/yarp/extension.c +94 -31
  13. data/ext/yarp/extension.h +2 -2
  14. data/include/yarp/ast.h +653 -150
  15. data/include/yarp/defines.h +2 -1
  16. data/include/yarp/diagnostic.h +3 -3
  17. data/include/yarp/enc/yp_encoding.h +10 -10
  18. data/include/yarp/node.h +10 -0
  19. data/include/yarp/parser.h +19 -19
  20. data/include/yarp/regexp.h +1 -1
  21. data/include/yarp/unescape.h +7 -5
  22. data/include/yarp/util/yp_buffer.h +3 -0
  23. data/include/yarp/util/yp_char.h +16 -16
  24. data/include/yarp/util/yp_constant_pool.h +2 -2
  25. data/include/yarp/util/yp_newline_list.h +7 -4
  26. data/include/yarp/util/yp_string.h +4 -4
  27. data/include/yarp/util/yp_string_list.h +0 -3
  28. data/include/yarp/util/yp_strpbrk.h +1 -1
  29. data/include/yarp/version.h +2 -2
  30. data/include/yarp.h +14 -3
  31. data/lib/yarp/desugar_visitor.rb +204 -0
  32. data/lib/yarp/ffi.rb +27 -1
  33. data/lib/yarp/lex_compat.rb +93 -25
  34. data/lib/yarp/mutation_visitor.rb +683 -0
  35. data/lib/yarp/node.rb +3121 -597
  36. data/lib/yarp/serialize.rb +198 -126
  37. data/lib/yarp.rb +53 -7
  38. data/src/diagnostic.c +1 -1
  39. data/src/enc/yp_big5.c +15 -42
  40. data/src/enc/yp_euc_jp.c +16 -43
  41. data/src/enc/yp_gbk.c +19 -46
  42. data/src/enc/yp_shift_jis.c +16 -43
  43. data/src/enc/yp_tables.c +36 -38
  44. data/src/enc/yp_unicode.c +20 -25
  45. data/src/enc/yp_windows_31j.c +16 -43
  46. data/src/node.c +1444 -836
  47. data/src/prettyprint.c +324 -103
  48. data/src/regexp.c +21 -21
  49. data/src/serialize.c +429 -276
  50. data/src/token_type.c +2 -2
  51. data/src/unescape.c +184 -136
  52. data/src/util/yp_buffer.c +7 -2
  53. data/src/util/yp_char.c +34 -34
  54. data/src/util/yp_constant_pool.c +4 -4
  55. data/src/util/yp_memchr.c +1 -1
  56. data/src/util/yp_newline_list.c +14 -3
  57. data/src/util/yp_string.c +22 -20
  58. data/src/util/yp_string_list.c +0 -6
  59. data/src/util/yp_strncasecmp.c +3 -6
  60. data/src/util/yp_strpbrk.c +8 -8
  61. data/src/yarp.c +1504 -615
  62. data/yarp.gemspec +3 -1
  63. metadata +4 -2
data/src/token_type.c CHANGED
@@ -1,6 +1,6 @@
1
1
  /******************************************************************************/
2
- /* This file is generated by the bin/template script and should not be */
3
- /* modified manually. See */
2
+ /* This file is generated by the templates/template.rb script and should not */
3
+ /* be modified manually. See */
4
4
  /* templates/src/token_type.c.erb */
5
5
  /* if you are looking to modify the */
6
6
  /* template */
data/src/unescape.c CHANGED
@@ -5,21 +5,33 @@
5
5
  /******************************************************************************/
6
6
 
7
7
  static inline bool
8
- yp_char_is_hexadecimal_digits(const char *c, size_t length) {
8
+ yp_char_is_hexadecimal_digits(const uint8_t *string, size_t length) {
9
9
  for (size_t index = 0; index < length; index++) {
10
- if (!yp_char_is_hexadecimal_digit(c[index])) {
10
+ if (!yp_char_is_hexadecimal_digit(string[index])) {
11
11
  return false;
12
12
  }
13
13
  }
14
14
  return true;
15
15
  }
16
16
 
17
+ // We don't call the char_width function unless we have to because it's
18
+ // expensive to go through the indirection of the function pointer. Instead we
19
+ // provide a fast path that will check if we can just return 1.
20
+ static inline size_t
21
+ yp_char_width(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
22
+ if (parser->encoding_changed || (*start >= 0x80)) {
23
+ return parser->encoding.char_width(start, end - start);
24
+ } else {
25
+ return 1;
26
+ }
27
+ }
28
+
17
29
  /******************************************************************************/
18
30
  /* Lookup tables for characters */
19
31
  /******************************************************************************/
20
32
 
21
33
  // This is a lookup table for unescapes that only take up a single character.
22
- static const unsigned char unescape_chars[] = {
34
+ static const uint8_t unescape_chars[] = {
23
35
  ['\''] = '\'',
24
36
  ['\\'] = '\\',
25
37
  ['a'] = '\a',
@@ -46,9 +58,8 @@ static const bool ascii_printable_chars[] = {
46
58
  };
47
59
 
48
60
  static inline bool
49
- char_is_ascii_printable(const char c) {
50
- unsigned char v = (unsigned char) c;
51
- return (v < 0x80) && ascii_printable_chars[v];
61
+ char_is_ascii_printable(const uint8_t b) {
62
+ return (b < 0x80) && ascii_printable_chars[b];
52
63
  }
53
64
 
54
65
  /******************************************************************************/
@@ -58,37 +69,39 @@ char_is_ascii_printable(const char c) {
58
69
  // Scan the 1-3 digits of octal into the value. Returns the number of digits
59
70
  // scanned.
60
71
  static inline size_t
61
- unescape_octal(const char *backslash, unsigned char *value) {
62
- *value = (unsigned char) (backslash[1] - '0');
63
- if (!yp_char_is_octal_digit(backslash[2])) {
72
+ unescape_octal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
73
+ *value = (uint8_t) (backslash[1] - '0');
74
+ if (backslash + 2 >= end || !yp_char_is_octal_digit(backslash[2])) {
64
75
  return 2;
65
76
  }
66
-
67
- *value = (unsigned char) ((*value << 3) | (backslash[2] - '0'));
68
- if (!yp_char_is_octal_digit(backslash[3])) {
77
+ *value = (uint8_t) ((*value << 3) | (backslash[2] - '0'));
78
+ if (backslash + 3 >= end || !yp_char_is_octal_digit(backslash[3])) {
69
79
  return 3;
70
80
  }
71
-
72
- *value = (unsigned char) ((*value << 3) | (backslash[3] - '0'));
81
+ *value = (uint8_t) ((*value << 3) | (backslash[3] - '0'));
73
82
  return 4;
74
83
  }
75
84
 
76
85
  // Convert a hexadecimal digit into its equivalent value.
77
- static inline unsigned char
78
- unescape_hexadecimal_digit(const char value) {
79
- return (unsigned char) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
86
+ static inline uint8_t
87
+ unescape_hexadecimal_digit(const uint8_t value) {
88
+ return (uint8_t) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
80
89
  }
81
90
 
82
91
  // Scan the 1-2 digits of hexadecimal into the value. Returns the number of
83
92
  // digits scanned.
84
93
  static inline size_t
85
- unescape_hexadecimal(const char *backslash, unsigned char *value) {
94
+ unescape_hexadecimal(const uint8_t *backslash, uint8_t *value, const uint8_t *end, yp_list_t *error_list) {
95
+ *value = 0;
96
+ if (backslash + 2 >= end || !yp_char_is_hexadecimal_digit(backslash[2])) {
97
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid hex escape.");
98
+ return 2;
99
+ }
86
100
  *value = unescape_hexadecimal_digit(backslash[2]);
87
- if (!yp_char_is_hexadecimal_digit(backslash[3])) {
101
+ if (backslash + 3 >= end || !yp_char_is_hexadecimal_digit(backslash[3])) {
88
102
  return 3;
89
103
  }
90
-
91
- *value = (unsigned char) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
104
+ *value = (uint8_t) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
92
105
  return 4;
93
106
  }
94
107
 
@@ -96,7 +109,7 @@ unescape_hexadecimal(const char *backslash, unsigned char *value) {
96
109
  // digits scanned. This function assumes that the characters have already been
97
110
  // validated.
98
111
  static inline void
99
- unescape_unicode(const char *string, size_t length, uint32_t *value) {
112
+ unescape_unicode(const uint8_t *string, size_t length, uint32_t *value) {
100
113
  *value = 0;
101
114
  for (size_t index = 0; index < length; index++) {
102
115
  if (index != 0) *value <<= 4;
@@ -108,27 +121,25 @@ unescape_unicode(const char *string, size_t length, uint32_t *value) {
108
121
  // 32-bit value to write. Writes the UTF-8 representation of the value to the
109
122
  // string and returns the number of bytes written.
110
123
  static inline size_t
111
- unescape_unicode_write(char *dest, uint32_t value, const char *start, const char *end, yp_list_t *error_list) {
112
- unsigned char *bytes = (unsigned char *) dest;
113
-
124
+ unescape_unicode_write(uint8_t *dest, uint32_t value, const uint8_t *start, const uint8_t *end, yp_list_t *error_list) {
114
125
  if (value <= 0x7F) {
115
126
  // 0xxxxxxx
116
- bytes[0] = (unsigned char) value;
127
+ dest[0] = (uint8_t) value;
117
128
  return 1;
118
129
  }
119
130
 
120
131
  if (value <= 0x7FF) {
121
132
  // 110xxxxx 10xxxxxx
122
- bytes[0] = (unsigned char) (0xC0 | (value >> 6));
123
- bytes[1] = (unsigned char) (0x80 | (value & 0x3F));
133
+ dest[0] = (uint8_t) (0xC0 | (value >> 6));
134
+ dest[1] = (uint8_t) (0x80 | (value & 0x3F));
124
135
  return 2;
125
136
  }
126
137
 
127
138
  if (value <= 0xFFFF) {
128
139
  // 1110xxxx 10xxxxxx 10xxxxxx
129
- bytes[0] = (unsigned char) (0xE0 | (value >> 12));
130
- bytes[1] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
131
- bytes[2] = (unsigned char) (0x80 | (value & 0x3F));
140
+ dest[0] = (uint8_t) (0xE0 | (value >> 12));
141
+ dest[1] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
142
+ dest[2] = (uint8_t) (0x80 | (value & 0x3F));
132
143
  return 3;
133
144
  }
134
145
 
@@ -136,20 +147,20 @@ unescape_unicode_write(char *dest, uint32_t value, const char *start, const char
136
147
  // the input is invalid.
137
148
  if (value <= 0x10FFFF) {
138
149
  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
139
- bytes[0] = (unsigned char) (0xF0 | (value >> 18));
140
- bytes[1] = (unsigned char) (0x80 | ((value >> 12) & 0x3F));
141
- bytes[2] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
142
- bytes[3] = (unsigned char) (0x80 | (value & 0x3F));
150
+ dest[0] = (uint8_t) (0xF0 | (value >> 18));
151
+ dest[1] = (uint8_t) (0x80 | ((value >> 12) & 0x3F));
152
+ dest[2] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
153
+ dest[3] = (uint8_t) (0x80 | (value & 0x3F));
143
154
  return 4;
144
155
  }
145
156
 
146
157
  // If we get here, then the value is too big. This is an error, but we don't
147
158
  // want to just crash, so instead we'll add an error to the error list and put
148
159
  // in a replacement character instead.
149
- yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
150
- bytes[0] = 0xEF;
151
- bytes[1] = 0xBF;
152
- bytes[2] = 0xBD;
160
+ if (error_list) yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
161
+ dest[0] = 0xEF;
162
+ dest[1] = 0xBF;
163
+ dest[2] = 0xBD;
153
164
  return 3;
154
165
  }
155
166
 
@@ -161,24 +172,30 @@ typedef enum {
161
172
  } yp_unescape_flag_t;
162
173
 
163
174
  // Unescape a single character value based on the given flags.
164
- static inline unsigned char
165
- unescape_char(const unsigned char value, const unsigned char flags) {
166
- unsigned char unescaped = value;
167
-
175
+ static inline uint8_t
176
+ unescape_char(uint8_t value, const uint8_t flags) {
168
177
  if (flags & YP_UNESCAPE_FLAG_CONTROL) {
169
- unescaped &= 0x1f;
178
+ value &= 0x1f;
170
179
  }
171
180
 
172
181
  if (flags & YP_UNESCAPE_FLAG_META) {
173
- unescaped |= 0x80;
182
+ value |= 0x80;
174
183
  }
175
184
 
176
- return unescaped;
185
+ return value;
177
186
  }
178
187
 
179
188
  // Read a specific escape sequence into the given destination.
180
- static const char *
181
- unescape(char *dest, size_t *dest_length, const char *backslash, const char *end, yp_list_t *error_list, const unsigned char flags, bool write_to_str) {
189
+ static const uint8_t *
190
+ unescape(
191
+ yp_parser_t *parser,
192
+ uint8_t *dest,
193
+ size_t *dest_length,
194
+ const uint8_t *backslash,
195
+ const uint8_t *end,
196
+ const uint8_t flags,
197
+ yp_list_t *error_list
198
+ ) {
182
199
  switch (backslash[1]) {
183
200
  case 'a':
184
201
  case 'b':
@@ -189,28 +206,28 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
189
206
  case 's':
190
207
  case 't':
191
208
  case 'v':
192
- if (write_to_str) {
193
- dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[1]], flags);
209
+ if (dest) {
210
+ dest[(*dest_length)++] = unescape_char(unescape_chars[backslash[1]], flags);
194
211
  }
195
212
  return backslash + 2;
196
213
  // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
197
214
  case '0': case '1': case '2': case '3': case '4':
198
215
  case '5': case '6': case '7': case '8': case '9': {
199
- unsigned char value;
200
- const char *cursor = backslash + unescape_octal(backslash, &value);
216
+ uint8_t value;
217
+ const uint8_t *cursor = backslash + unescape_octal(backslash, &value, end);
201
218
 
202
- if (write_to_str) {
203
- dest[(*dest_length)++] = (char) unescape_char(value, flags);
219
+ if (dest) {
220
+ dest[(*dest_length)++] = unescape_char(value, flags);
204
221
  }
205
222
  return cursor;
206
223
  }
207
224
  // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
208
225
  case 'x': {
209
- unsigned char value;
210
- const char *cursor = backslash + unescape_hexadecimal(backslash, &value);
226
+ uint8_t value;
227
+ const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value, end, error_list);
211
228
 
212
- if (write_to_str) {
213
- dest[(*dest_length)++] = (char) unescape_char(value, flags);
229
+ if (dest) {
230
+ dest[(*dest_length)++] = unescape_char(value, flags);
214
231
  }
215
232
  return cursor;
216
233
  }
@@ -218,28 +235,28 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
218
235
  // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
219
236
  case 'u': {
220
237
  if ((flags & YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) {
221
- yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
238
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
222
239
  return backslash + 2;
223
240
  }
224
241
 
225
242
  if ((backslash + 3) < end && backslash[2] == '{') {
226
- const char *unicode_cursor = backslash + 3;
227
- const char *extra_codepoints_start = NULL;
243
+ const uint8_t *unicode_cursor = backslash + 3;
244
+ const uint8_t *extra_codepoints_start = NULL;
228
245
  int codepoints_count = 0;
229
246
 
230
247
  unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
231
248
 
232
- while ((*unicode_cursor != '}') && (unicode_cursor < end)) {
233
- const char *unicode_start = unicode_cursor;
249
+ while ((unicode_cursor < end) && (*unicode_cursor != '}')) {
250
+ const uint8_t *unicode_start = unicode_cursor;
234
251
  size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
235
252
 
236
253
  // \u{nnnn} character literal allows only 1-6 hexadecimal digits
237
- if (hexadecimal_length > 6)
238
- yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
239
-
254
+ if (hexadecimal_length > 6) {
255
+ if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
256
+ }
240
257
  // there are not hexadecimal characters
241
- if (hexadecimal_length == 0) {
242
- yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
258
+ else if (hexadecimal_length == 0) {
259
+ if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
243
260
  return unicode_cursor;
244
261
  }
245
262
 
@@ -251,7 +268,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
251
268
 
252
269
  uint32_t value;
253
270
  unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
254
- if (write_to_str) {
271
+ if (dest) {
255
272
  *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
256
273
  }
257
274
 
@@ -259,23 +276,29 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
259
276
  }
260
277
 
261
278
  // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
262
- if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
263
- yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
279
+ if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1) {
280
+ if (error_list) yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
281
+ }
264
282
 
265
- return unicode_cursor + 1;
266
- }
283
+ if (unicode_cursor < end && *unicode_cursor == '}') {
284
+ unicode_cursor++;
285
+ } else {
286
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, unicode_cursor, "invalid Unicode escape.");
287
+ }
267
288
 
268
- if ((backslash + 2) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
289
+ return unicode_cursor;
290
+ }
291
+ else if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
269
292
  uint32_t value;
270
293
  unescape_unicode(backslash + 2, 4, &value);
271
294
 
272
- if (write_to_str) {
295
+ if (dest) {
273
296
  *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
274
297
  }
275
298
  return backslash + 6;
276
299
  }
277
300
 
278
- yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
301
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
279
302
  return backslash + 2;
280
303
  }
281
304
  // \c\M-x meta control character, where x is an ASCII printable character
@@ -283,31 +306,31 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
283
306
  // \cx control character, where x is an ASCII printable character
284
307
  case 'c':
285
308
  if (backslash + 2 >= end) {
286
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
309
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
287
310
  return end;
288
311
  }
289
312
 
290
313
  if (flags & YP_UNESCAPE_FLAG_CONTROL) {
291
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
314
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
292
315
  return backslash + 2;
293
316
  }
294
317
 
295
318
  switch (backslash[2]) {
296
319
  case '\\':
297
- return unescape(dest, dest_length, backslash + 2, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
320
+ return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
298
321
  case '?':
299
- if (write_to_str) {
300
- dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
322
+ if (dest) {
323
+ dest[(*dest_length)++] = unescape_char(0x7f, flags);
301
324
  }
302
325
  return backslash + 3;
303
326
  default: {
304
327
  if (!char_is_ascii_printable(backslash[2])) {
305
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
328
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
306
329
  return backslash + 2;
307
330
  }
308
331
 
309
- if (write_to_str) {
310
- dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
332
+ if (dest) {
333
+ dest[(*dest_length)++] = unescape_char(backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
311
334
  }
312
335
  return backslash + 3;
313
336
  }
@@ -316,36 +339,36 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
316
339
  // \C-? delete, ASCII 7Fh (DEL)
317
340
  case 'C':
318
341
  if (backslash + 3 >= end) {
319
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
342
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
320
343
  return end;
321
344
  }
322
345
 
323
346
  if (flags & YP_UNESCAPE_FLAG_CONTROL) {
324
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
347
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
325
348
  return backslash + 2;
326
349
  }
327
350
 
328
351
  if (backslash[2] != '-') {
329
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
352
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
330
353
  return backslash + 2;
331
354
  }
332
355
 
333
356
  switch (backslash[3]) {
334
357
  case '\\':
335
- return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
358
+ return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
336
359
  case '?':
337
- if (write_to_str) {
338
- dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
360
+ if (dest) {
361
+ dest[(*dest_length)++] = unescape_char(0x7f, flags);
339
362
  }
340
363
  return backslash + 4;
341
364
  default:
342
365
  if (!char_is_ascii_printable(backslash[3])) {
343
- yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
366
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
344
367
  return backslash + 2;
345
368
  }
346
369
 
347
- if (write_to_str) {
348
- dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
370
+ if (dest) {
371
+ dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
349
372
  }
350
373
  return backslash + 4;
351
374
  }
@@ -354,32 +377,32 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
354
377
  // \M-x meta character, where x is an ASCII printable character
355
378
  case 'M': {
356
379
  if (backslash + 3 >= end) {
357
- yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
380
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
358
381
  return end;
359
382
  }
360
383
 
361
384
  if (flags & YP_UNESCAPE_FLAG_META) {
362
- yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
385
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
363
386
  return backslash + 2;
364
387
  }
365
388
 
366
389
  if (backslash[2] != '-') {
367
- yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
390
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
368
391
  return backslash + 2;
369
392
  }
370
393
 
371
394
  if (backslash[3] == '\\') {
372
- return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_META, write_to_str);
395
+ return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, error_list);
373
396
  }
374
397
 
375
398
  if (char_is_ascii_printable(backslash[3])) {
376
- if (write_to_str) {
377
- dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_META);
399
+ if (dest) {
400
+ dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_META);
378
401
  }
379
402
  return backslash + 4;
380
403
  }
381
404
 
382
- yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
405
+ if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
383
406
  return backslash + 3;
384
407
  }
385
408
  // \n
@@ -390,14 +413,17 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
390
413
  if (backslash + 2 < end && backslash[2] == '\n') {
391
414
  return backslash + 3;
392
415
  }
393
-
394
- /* fallthrough */
416
+ /* fallthrough */
395
417
  // In this case we're escaping something that doesn't need escaping.
396
418
  default: {
397
- if (write_to_str) {
398
- dest[(*dest_length)++] = backslash[1];
419
+ size_t width = yp_char_width(parser, backslash + 1, end);
420
+
421
+ if (dest) {
422
+ memcpy(dest + *dest_length, backslash + 1, width);
423
+ *dest_length += width;
399
424
  }
400
- return backslash + 2;
425
+
426
+ return backslash + 1 + width;
401
427
  }
402
428
  }
403
429
  }
@@ -430,14 +456,14 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
430
456
  // \c\M-x same as above
431
457
  // \c? or \C-? delete, ASCII 7Fh (DEL)
432
458
  //
433
- YP_EXPORTED_FUNCTION void
434
- yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, yp_list_t *error_list) {
459
+ static void
460
+ yp_unescape_manipulate_string_or_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
435
461
  if (unescape_type == YP_UNESCAPE_NONE) {
436
462
  // If we're not unescaping then we can reference the source directly.
437
463
  return;
438
464
  }
439
465
 
440
- const char *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);
466
+ const uint8_t *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);
441
467
 
442
468
  if (backslash == NULL) {
443
469
  // Here there are no escapes, so we can reference the source directly.
@@ -446,21 +472,21 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
446
472
 
447
473
  // Here we have found an escape character, so we need to handle all escapes
448
474
  // within the string.
449
- char *allocated = malloc(string->length);
475
+ uint8_t *allocated = malloc(string->length);
450
476
  if (allocated == NULL) {
451
- yp_diagnostic_list_append(error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
477
+ yp_diagnostic_list_append(&parser->error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
452
478
  return;
453
479
  }
454
480
 
455
481
  // This is the memory address where we're putting the unescaped string.
456
- char *dest = allocated;
482
+ uint8_t *dest = allocated;
457
483
  size_t dest_length = 0;
458
484
 
459
485
  // This is the current position in the source string that we're looking at.
460
486
  // It's going to move along behind the backslash so that we can copy each
461
487
  // segment of the string that doesn't contain an escape.
462
- const char *cursor = string->source;
463
- const char *end = string->source + string->length;
488
+ const uint8_t *cursor = string->source;
489
+ const uint8_t *end = string->source + string->length;
464
490
 
465
491
  // For each escape found in the source string, we will handle it and update
466
492
  // the moving cursor->backslash window.
@@ -479,7 +505,7 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
479
505
  switch (backslash[1]) {
480
506
  case '\\':
481
507
  case '\'':
482
- dest[dest_length++] = (char) unescape_chars[(unsigned char) backslash[1]];
508
+ dest[dest_length++] = unescape_chars[backslash[1]];
483
509
  cursor = backslash + 2;
484
510
  break;
485
511
  default:
@@ -493,7 +519,13 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
493
519
  // This is the only type of unescaping left. In this case we need to
494
520
  // handle all of the different unescapes.
495
521
  assert(unescape_type == YP_UNESCAPE_ALL);
496
- cursor = unescape(dest, &dest_length, backslash, end, error_list, YP_UNESCAPE_FLAG_NONE, true);
522
+
523
+ uint8_t flags = YP_UNESCAPE_FLAG_NONE;
524
+ if (expect_single_codepoint) {
525
+ flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
526
+ }
527
+
528
+ cursor = unescape(parser, dest, &dest_length, backslash, end, flags, &parser->error_list);
497
529
  break;
498
530
  }
499
531
 
@@ -521,50 +553,66 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
521
553
  yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor)));
522
554
  }
523
555
 
524
- YP_EXPORTED_FUNCTION bool
525
- yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
526
- bool success;
527
-
528
- yp_parser_t parser;
529
- yp_parser_init(&parser, start, length, "");
530
-
531
- yp_list_t error_list = YP_LIST_EMPTY;
532
- yp_string_shared_init(result, start, start + length);
533
- yp_unescape_manipulate_string(&parser, result, unescape_type, &error_list);
534
- success = yp_list_empty_p(&error_list);
535
-
536
- yp_list_free(&error_list);
537
- yp_parser_free(&parser);
556
+ YP_EXPORTED_FUNCTION void
557
+ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
558
+ yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, false);
559
+ }
538
560
 
539
- return success;
561
+ void
562
+ yp_unescape_manipulate_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
563
+ yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, true);
540
564
  }
541
565
 
542
566
  // This function is similar to yp_unescape_manipulate_string, except it doesn't
543
567
  // actually perform any string manipulations. Instead, it calculates how long
544
568
  // the unescaped character is, and returns that value
545
- YP_EXPORTED_FUNCTION size_t
546
- yp_unescape_calculate_difference(const char *backslash, const char *end, yp_unescape_type_t unescape_type, bool expect_single_codepoint, yp_list_t *error_list) {
569
+ size_t
570
+ yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
547
571
  assert(unescape_type != YP_UNESCAPE_NONE);
548
572
 
573
+ if (backslash + 1 >= parser->end) {
574
+ return 0;
575
+ }
576
+
549
577
  switch (backslash[1]) {
550
578
  case '\\':
551
579
  case '\'':
552
580
  return 2;
553
581
  default: {
554
- if (unescape_type == YP_UNESCAPE_MINIMAL) return 2;
582
+ if (unescape_type == YP_UNESCAPE_MINIMAL) {
583
+ return 1 + yp_char_width(parser, backslash + 1, parser->end);
584
+ }
555
585
 
556
586
  // This is the only type of unescaping left. In this case we need to
557
587
  // handle all of the different unescapes.
558
588
  assert(unescape_type == YP_UNESCAPE_ALL);
559
589
 
560
- unsigned char flags = YP_UNESCAPE_FLAG_NONE;
561
- if (expect_single_codepoint)
590
+ uint8_t flags = YP_UNESCAPE_FLAG_NONE;
591
+ if (expect_single_codepoint) {
562
592
  flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
593
+ }
563
594
 
564
- const char *cursor = unescape(NULL, 0, backslash, end, error_list, flags, false);
595
+ const uint8_t *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, NULL);
565
596
  assert(cursor > backslash);
566
597
 
567
598
  return (size_t) (cursor - backslash);
568
599
  }
569
600
  }
570
601
  }
602
+
603
+ // This is one of the main entry points into the extension. It accepts a source
604
+ // string, a type of unescaping, and a pointer to a result string. It returns a
605
+ // boolean indicating whether or not the unescaping was successful.
606
+ YP_EXPORTED_FUNCTION bool
607
+ yp_unescape_string(const uint8_t *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
608
+ yp_parser_t parser;
609
+ yp_parser_init(&parser, start, length, NULL);
610
+
611
+ yp_string_shared_init(result, start, start + length);
612
+ yp_unescape_manipulate_string(&parser, result, unescape_type);
613
+
614
+ bool success = yp_list_empty_p(&parser.error_list);
615
+ yp_parser_free(&parser);
616
+
617
+ return success;
618
+ }