yarp 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/CODE_OF_CONDUCT.md +76 -0
  3. data/CONTRIBUTING.md +51 -0
  4. data/LICENSE.md +7 -0
  5. data/Makefile.in +79 -0
  6. data/README.md +86 -0
  7. data/config.h.in +25 -0
  8. data/config.yml +2147 -0
  9. data/configure +4487 -0
  10. data/docs/build_system.md +85 -0
  11. data/docs/building.md +26 -0
  12. data/docs/configuration.md +56 -0
  13. data/docs/design.md +53 -0
  14. data/docs/encoding.md +116 -0
  15. data/docs/extension.md +20 -0
  16. data/docs/fuzzing.md +93 -0
  17. data/docs/heredocs.md +36 -0
  18. data/docs/mapping.md +117 -0
  19. data/docs/ripper.md +36 -0
  20. data/docs/serialization.md +130 -0
  21. data/docs/testing.md +55 -0
  22. data/ext/yarp/api_node.c +3680 -0
  23. data/ext/yarp/api_pack.c +256 -0
  24. data/ext/yarp/extconf.rb +131 -0
  25. data/ext/yarp/extension.c +547 -0
  26. data/ext/yarp/extension.h +18 -0
  27. data/include/yarp/ast.h +1412 -0
  28. data/include/yarp/defines.h +54 -0
  29. data/include/yarp/diagnostic.h +24 -0
  30. data/include/yarp/enc/yp_encoding.h +94 -0
  31. data/include/yarp/node.h +36 -0
  32. data/include/yarp/pack.h +141 -0
  33. data/include/yarp/parser.h +389 -0
  34. data/include/yarp/regexp.h +19 -0
  35. data/include/yarp/unescape.h +42 -0
  36. data/include/yarp/util/yp_buffer.h +39 -0
  37. data/include/yarp/util/yp_char.h +75 -0
  38. data/include/yarp/util/yp_constant_pool.h +64 -0
  39. data/include/yarp/util/yp_list.h +67 -0
  40. data/include/yarp/util/yp_memchr.h +14 -0
  41. data/include/yarp/util/yp_newline_list.h +54 -0
  42. data/include/yarp/util/yp_state_stack.h +24 -0
  43. data/include/yarp/util/yp_string.h +57 -0
  44. data/include/yarp/util/yp_string_list.h +28 -0
  45. data/include/yarp/util/yp_strpbrk.h +29 -0
  46. data/include/yarp/version.h +5 -0
  47. data/include/yarp.h +69 -0
  48. data/lib/yarp/lex_compat.rb +759 -0
  49. data/lib/yarp/node.rb +7428 -0
  50. data/lib/yarp/pack.rb +185 -0
  51. data/lib/yarp/ripper_compat.rb +174 -0
  52. data/lib/yarp/serialize.rb +389 -0
  53. data/lib/yarp.rb +330 -0
  54. data/src/diagnostic.c +25 -0
  55. data/src/enc/yp_big5.c +79 -0
  56. data/src/enc/yp_euc_jp.c +85 -0
  57. data/src/enc/yp_gbk.c +88 -0
  58. data/src/enc/yp_shift_jis.c +83 -0
  59. data/src/enc/yp_tables.c +509 -0
  60. data/src/enc/yp_unicode.c +2320 -0
  61. data/src/enc/yp_windows_31j.c +83 -0
  62. data/src/node.c +2011 -0
  63. data/src/pack.c +493 -0
  64. data/src/prettyprint.c +1782 -0
  65. data/src/regexp.c +580 -0
  66. data/src/serialize.c +1576 -0
  67. data/src/token_type.c +347 -0
  68. data/src/unescape.c +576 -0
  69. data/src/util/yp_buffer.c +78 -0
  70. data/src/util/yp_char.c +229 -0
  71. data/src/util/yp_constant_pool.c +147 -0
  72. data/src/util/yp_list.c +50 -0
  73. data/src/util/yp_memchr.c +31 -0
  74. data/src/util/yp_newline_list.c +119 -0
  75. data/src/util/yp_state_stack.c +25 -0
  76. data/src/util/yp_string.c +207 -0
  77. data/src/util/yp_string_list.c +32 -0
  78. data/src/util/yp_strncasecmp.c +20 -0
  79. data/src/util/yp_strpbrk.c +66 -0
  80. data/src/yarp.c +13211 -0
  81. data/yarp.gemspec +100 -0
  82. metadata +125 -0
data/src/unescape.c ADDED
@@ -0,0 +1,576 @@
1
+ #include "yarp.h"
2
+
3
+ /******************************************************************************/
4
+ /* Character checks */
5
+ /******************************************************************************/
6
+
7
// Returns true when each of the first `length` bytes starting at `c` is a
// hexadecimal digit. Scanning stops at the first byte that is not one. A
// `length` of zero yields true.
static inline bool
yp_char_is_hexadecimal_digits(const char *c, size_t length) {
    const char *cursor = c;
    size_t remaining = length;

    while (remaining > 0) {
        if (!yp_char_is_hexadecimal_digit(*cursor)) {
            return false;
        }
        cursor++;
        remaining--;
    }

    return true;
}
16
+
17
+ /******************************************************************************/
18
+ /* Lookup tables for characters */
19
+ /******************************************************************************/
20
+
21
// Lookup table for the escapes that stand for exactly one byte. It is indexed
// by the character that follows the backslash and yields the byte that the
// escape denotes. Slots that are not listed here are implicitly zero, and the
// table is only as long as its largest index ('v').
static const unsigned char unescape_chars[] = {
    // Characters that escape to themselves.
    ['\''] = '\'',
    ['\\'] = '\\',

    // Control characters, in ascending order of the byte they produce.
    ['a'] = '\a',
    ['b'] = '\b',
    ['t'] = '\t',
    ['n'] = '\n',
    ['v'] = '\v',
    ['f'] = '\f',
    ['r'] = '\r',
    ['e'] = '\x1b',

    // \s is a plain space.
    ['s'] = ' '
};
35
+
36
// Lookup table, indexed by a 7-bit ASCII value, saying whether that character
// is accepted as the operand of a control or meta escape. Note that the table
// accepts the whitespace controls \t \n \v \f \r and rejects the backslash
// and DEL.
static const bool ascii_printable_chars[] = {
    0, 0, 0, 0, 0, 0, 0, 0, // 0x00-0x07
    0, 1, 1, 1, 1, 1, 0, 0, // 0x08-0x0F: \t \n \v \f \r are accepted
    0, 0, 0, 0, 0, 0, 0, 0, // 0x10-0x17
    0, 0, 0, 0, 0, 0, 0, 0, // 0x18-0x1F
    1, 1, 1, 1, 1, 1, 1, 1, // 0x20-0x27
    1, 1, 1, 1, 1, 1, 1, 1, // 0x28-0x2F
    1, 1, 1, 1, 1, 1, 1, 1, // 0x30-0x37
    1, 1, 1, 1, 1, 1, 1, 1, // 0x38-0x3F
    1, 1, 1, 1, 1, 1, 1, 1, // 0x40-0x47
    1, 1, 1, 1, 1, 1, 1, 1, // 0x48-0x4F
    1, 1, 1, 1, 1, 1, 1, 1, // 0x50-0x57
    1, 1, 1, 1, 0, 1, 1, 1, // 0x58-0x5F: the backslash (0x5C) is rejected
    1, 1, 1, 1, 1, 1, 1, 1, // 0x60-0x67
    1, 1, 1, 1, 1, 1, 1, 1, // 0x68-0x6F
    1, 1, 1, 1, 1, 1, 1, 1, // 0x70-0x77
    1, 1, 1, 1, 1, 1, 1, 0  // 0x78-0x7F: DEL (0x7F) is rejected
};

// Returns true if the given character is within the ASCII range and is marked
// as printable in the table above. Bytes with the high bit set are never
// printable.
static inline bool
char_is_ascii_printable(const char c) {
    const unsigned char byte = (unsigned char) c;

    if (byte >= 0x80) {
        return false;
    }

    return ascii_printable_chars[byte];
}
53
+
54
+ /******************************************************************************/
55
+ /* Unescaping for segments */
56
+ /******************************************************************************/
57
+
58
// Decode the octal escape that starts at the given backslash. The character
// at backslash[1] is taken as the first digit without being checked, and up to
// two more octal digits are folded in after it. The result, truncated to a
// single byte, is stored in value. Returns the number of bytes the escape
// occupies including the backslash itself (2, 3, or 4).
static inline size_t
unescape_octal(const char *backslash, unsigned char *value) {
    unsigned char accumulated = (unsigned char) (backslash[1] - '0');
    size_t consumed = 2;

    while (consumed < 4 && yp_char_is_octal_digit(backslash[consumed])) {
        accumulated = (unsigned char) ((accumulated << 3) | (backslash[consumed] - '0'));
        consumed++;
    }

    *value = accumulated;
    return consumed;
}
75
+
76
// Convert a single hexadecimal digit character into the number it represents.
// The character is not validated. Anything at or below '9' is treated as a
// decimal digit, and anything above it is treated as a letter whose low three
// bits give its position in the alphabet (so both 'a' and 'A' map to 10).
static inline unsigned char
unescape_hexadecimal_digit(const char value) {
    if (value <= '9') {
        return (unsigned char) (value - '0');
    }

    return (unsigned char) ((value & 0x7) + 9);
}
81
+
82
// Decode the \x escape that starts at the given backslash. The character at
// backslash[2] is converted as the first digit without being checked. If
// backslash[3] is also a hexadecimal digit then it becomes the low nibble.
// The resulting byte is stored in value. Returns the number of bytes the
// escape occupies including the leading "\x" (3 or 4).
static inline size_t
unescape_hexadecimal(const char *backslash, unsigned char *value) {
    const unsigned char high = unescape_hexadecimal_digit(backslash[2]);

    if (yp_char_is_hexadecimal_digit(backslash[3])) {
        const unsigned char low = unescape_hexadecimal_digit(backslash[3]);
        *value = (unsigned char) ((high << 4) | low);
        return 4;
    }

    *value = high;
    return 3;
}
94
+
95
// Fold the `length` hexadecimal digits that start at `string` into a single
// 32-bit value, most significant digit first, and store it in `value`. A
// length of zero stores 0. This function assumes that the characters have
// already been validated as hexadecimal digits. It does not return anything.
static inline void
unescape_unicode(const char *string, size_t length, uint32_t *value) {
    uint32_t codepoint = 0;

    for (size_t index = 0; index < length; index++) {
        codepoint = (codepoint << 4) | unescape_hexadecimal_digit(string[index]);
    }

    *value = codepoint;
}
106
+
107
+ // Accepts the pointer to the string to write the unicode value along with the
108
+ // 32-bit value to write. Writes the UTF-8 representation of the value to the
109
+ // string and returns the number of bytes written.
110
+ static inline size_t
111
+ unescape_unicode_write(char *dest, uint32_t value, const char *start, const char *end, yp_list_t *error_list) {
112
+ unsigned char *bytes = (unsigned char *) dest;
113
+
114
+ if (value <= 0x7F) {
115
+ // 0xxxxxxx
116
+ bytes[0] = (unsigned char) value;
117
+ return 1;
118
+ }
119
+
120
+ if (value <= 0x7FF) {
121
+ // 110xxxxx 10xxxxxx
122
+ bytes[0] = (unsigned char) (0xC0 | (value >> 6));
123
+ bytes[1] = (unsigned char) (0x80 | (value & 0x3F));
124
+ return 2;
125
+ }
126
+
127
+ if (value <= 0xFFFF) {
128
+ // 1110xxxx 10xxxxxx 10xxxxxx
129
+ bytes[0] = (unsigned char) (0xE0 | (value >> 12));
130
+ bytes[1] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
131
+ bytes[2] = (unsigned char) (0x80 | (value & 0x3F));
132
+ return 3;
133
+ }
134
+
135
+ // At this point it must be a 4 digit UTF-8 representation. If it's not, then
136
+ // the input is invalid.
137
+ if (value <= 0x10FFFF) {
138
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
139
+ bytes[0] = (unsigned char) (0xF0 | (value >> 18));
140
+ bytes[1] = (unsigned char) (0x80 | ((value >> 12) & 0x3F));
141
+ bytes[2] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
142
+ bytes[3] = (unsigned char) (0x80 | (value & 0x3F));
143
+ return 4;
144
+ }
145
+
146
+ // If we get here, then the value is too big. This is an error, but we don't
147
+ // want to just crash, so instead we'll add an error to the error list and put
148
+ // in a replacement character instead.
149
+ yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
150
+ bytes[0] = 0xEF;
151
+ bytes[1] = 0xBF;
152
+ bytes[2] = 0xBD;
153
+ return 3;
154
+ }
155
+
156
// Bit flags that carry the state of an escape sequence through nested calls
// to unescape. They are combined with bitwise or.
typedef enum {
    // No modifier is in effect.
    YP_UNESCAPE_FLAG_NONE = 0,

    // A control escape (\c or \C-) is being applied.
    YP_UNESCAPE_FLAG_CONTROL = 1 << 0,

    // A meta escape (\M-) is being applied.
    YP_UNESCAPE_FLAG_META = 1 << 1,

    // A \u{...} escape should contain a single codepoint only.
    YP_UNESCAPE_FLAG_EXPECT_SINGLE = 1 << 2
} yp_unescape_flag_t;
162
+
163
+ // Unescape a single character value based on the given flags.
164
+ static inline unsigned char
165
+ unescape_char(const unsigned char value, const unsigned char flags) {
166
+ unsigned char unescaped = value;
167
+
168
+ if (flags & YP_UNESCAPE_FLAG_CONTROL) {
169
+ unescaped &= 0x1f;
170
+ }
171
+
172
+ if (flags & YP_UNESCAPE_FLAG_META) {
173
+ unescaped |= 0x80;
174
+ }
175
+
176
+ return unescaped;
177
+ }
178
+
179
+ // Read a specific escape sequence into the given destination.
180
+ static const char *
181
+ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end, yp_list_t *error_list, const unsigned char flags, bool write_to_str) {
182
+ switch (backslash[1]) {
183
+ // \a \b \e \f \n \r \s \t \v
184
+ case '\r': {
185
+ // if this is an \r\n we need to escape both
186
+ if (write_to_str) {
187
+ dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[1]], flags);
188
+ }
189
+
190
+ if (backslash + 2 < end && backslash[2] == '\n') {
191
+ if (write_to_str) {
192
+ dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[2]], flags);
193
+ }
194
+ return backslash + 3;
195
+ }
196
+
197
+ return backslash + 2;
198
+ }
199
+ case 'a':
200
+ case 'b':
201
+ case 'e':
202
+ case 'f':
203
+ case 'n':
204
+ case 'r':
205
+ case 's':
206
+ case 't':
207
+ case 'v':
208
+ if (write_to_str) {
209
+ dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[1]], flags);
210
+ }
211
+ return backslash + 2;
212
+ // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
213
+ case '0': case '1': case '2': case '3': case '4':
214
+ case '5': case '6': case '7': case '8': case '9': {
215
+ unsigned char value;
216
+ const char *cursor = backslash + unescape_octal(backslash, &value);
217
+
218
+ if (write_to_str) {
219
+ dest[(*dest_length)++] = (char) unescape_char(value, flags);
220
+ }
221
+ return cursor;
222
+ }
223
+ // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
224
+ case 'x': {
225
+ unsigned char value;
226
+ const char *cursor = backslash + unescape_hexadecimal(backslash, &value);
227
+
228
+ if (write_to_str) {
229
+ dest[(*dest_length)++] = (char) unescape_char(value, flags);
230
+ }
231
+ return cursor;
232
+ }
233
+ // \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
234
+ // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
235
+ case 'u': {
236
+ if ((flags & YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) {
237
+ yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
238
+ return backslash + 2;
239
+ }
240
+
241
+ if ((backslash + 3) < end && backslash[2] == '{') {
242
+ const char *unicode_cursor = backslash + 3;
243
+ const char *extra_codepoints_start = NULL;
244
+ int codepoints_count = 0;
245
+
246
+ unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
247
+
248
+ while ((*unicode_cursor != '}') && (unicode_cursor < end)) {
249
+ const char *unicode_start = unicode_cursor;
250
+ size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
251
+
252
+ // \u{nnnn} character literal allows only 1-6 hexadecimal digits
253
+ if (hexadecimal_length > 6)
254
+ yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
255
+
256
+ // there are not hexadecimal characters
257
+ if (hexadecimal_length == 0) {
258
+ yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
259
+ return unicode_cursor;
260
+ }
261
+
262
+ unicode_cursor += hexadecimal_length;
263
+
264
+ codepoints_count++;
265
+ if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count == 2)
266
+ extra_codepoints_start = unicode_start;
267
+
268
+ uint32_t value;
269
+ unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
270
+ if (write_to_str) {
271
+ *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
272
+ }
273
+
274
+ unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
275
+ }
276
+
277
+ // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
278
+ if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
279
+ yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
280
+
281
+ return unicode_cursor + 1;
282
+ }
283
+
284
+ if ((backslash + 2) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
285
+ uint32_t value;
286
+ unescape_unicode(backslash + 2, 4, &value);
287
+
288
+ if (write_to_str) {
289
+ *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
290
+ }
291
+ return backslash + 6;
292
+ }
293
+
294
+ yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
295
+ return backslash + 2;
296
+ }
297
+ // \c\M-x meta control character, where x is an ASCII printable character
298
+ // \c? delete, ASCII 7Fh (DEL)
299
+ // \cx control character, where x is an ASCII printable character
300
+ case 'c':
301
+ if (backslash + 2 >= end) {
302
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
303
+ return end;
304
+ }
305
+
306
+ if (flags & YP_UNESCAPE_FLAG_CONTROL) {
307
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
308
+ return backslash + 2;
309
+ }
310
+
311
+ switch (backslash[2]) {
312
+ case '\\':
313
+ return unescape(dest, dest_length, backslash + 2, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
314
+ case '?':
315
+ if (write_to_str) {
316
+ dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
317
+ }
318
+ return backslash + 3;
319
+ default: {
320
+ if (!char_is_ascii_printable(backslash[2])) {
321
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
322
+ return backslash + 2;
323
+ }
324
+
325
+ if (write_to_str) {
326
+ dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
327
+ }
328
+ return backslash + 3;
329
+ }
330
+ }
331
+ // \C-x control character, where x is an ASCII printable character
332
+ // \C-? delete, ASCII 7Fh (DEL)
333
+ case 'C':
334
+ if (backslash + 3 >= end) {
335
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
336
+ return end;
337
+ }
338
+
339
+ if (flags & YP_UNESCAPE_FLAG_CONTROL) {
340
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
341
+ return backslash + 2;
342
+ }
343
+
344
+ if (backslash[2] != '-') {
345
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
346
+ return backslash + 2;
347
+ }
348
+
349
+ switch (backslash[3]) {
350
+ case '\\':
351
+ return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
352
+ case '?':
353
+ if (write_to_str) {
354
+ dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
355
+ }
356
+ return backslash + 4;
357
+ default:
358
+ if (!char_is_ascii_printable(backslash[3])) {
359
+ yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
360
+ return backslash + 2;
361
+ }
362
+
363
+ if (write_to_str) {
364
+ dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
365
+ }
366
+ return backslash + 4;
367
+ }
368
+ // \M-\C-x meta control character, where x is an ASCII printable character
369
+ // \M-\cx meta control character, where x is an ASCII printable character
370
+ // \M-x meta character, where x is an ASCII printable character
371
+ case 'M': {
372
+ if (backslash + 3 >= end) {
373
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
374
+ return end;
375
+ }
376
+
377
+ if (flags & YP_UNESCAPE_FLAG_META) {
378
+ yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
379
+ return backslash + 2;
380
+ }
381
+
382
+ if (backslash[2] != '-') {
383
+ yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
384
+ return backslash + 2;
385
+ }
386
+
387
+ if (backslash[3] == '\\') {
388
+ return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_META, write_to_str);
389
+ }
390
+
391
+ if (char_is_ascii_printable(backslash[3])) {
392
+ if (write_to_str) {
393
+ dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_META);
394
+ }
395
+ return backslash + 4;
396
+ }
397
+
398
+ yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
399
+ return backslash + 3;
400
+ }
401
+ // In this case we're escaping something that doesn't need escaping.
402
+ default:
403
+ {
404
+ if (write_to_str) {
405
+ dest[(*dest_length)++] = backslash[1];
406
+ }
407
+ return backslash + 2;
408
+ }
409
+ }
410
+ }
411
+
412
+ /******************************************************************************/
413
+ /* Public functions and entrypoints */
414
+ /******************************************************************************/
415
+
416
+ // Unescape the contents of the given token into the given string using the
417
+ // given unescape mode. The supported escapes are:
418
+ //
419
+ // \a bell, ASCII 07h (BEL)
420
+ // \b backspace, ASCII 08h (BS)
421
+ // \t horizontal tab, ASCII 09h (TAB)
422
+ // \n newline (line feed), ASCII 0Ah (LF)
423
+ // \v vertical tab, ASCII 0Bh (VT)
424
+ // \f form feed, ASCII 0Ch (FF)
425
+ // \r carriage return, ASCII 0Dh (CR)
426
+ // \e escape, ASCII 1Bh (ESC)
427
+ // \s space, ASCII 20h (SPC)
428
+ // \\ backslash
429
+ // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
430
+ // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
431
+ // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
432
+ // \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
433
+ // \cx or \C-x control character, where x is an ASCII printable character
434
+ // \M-x meta character, where x is an ASCII printable character
435
+ // \M-\C-x meta control character, where x is an ASCII printable character
436
+ // \M-\cx same as above
437
+ // \c\M-x same as above
438
+ // \c? or \C-? delete, ASCII 7Fh (DEL)
439
+ //
440
+ YP_EXPORTED_FUNCTION void
441
+ yp_unescape_manipulate_string(yp_parser_t *parser, const char *value, size_t length, yp_string_t *string, yp_unescape_type_t unescape_type, yp_list_t *error_list) {
442
+ if (unescape_type == YP_UNESCAPE_NONE) {
443
+ // If we're not unescaping then we can reference the source directly.
444
+ yp_string_shared_init(string, value, value + length);
445
+ return;
446
+ }
447
+
448
+ const char *backslash = yp_memchr(value, '\\', length, parser->encoding_changed, &parser->encoding);
449
+
450
+ if (backslash == NULL) {
451
+ // Here there are no escapes, so we can reference the source directly.
452
+ yp_string_shared_init(string, value, value + length);
453
+ return;
454
+ }
455
+
456
+ // Here we have found an escape character, so we need to handle all escapes
457
+ // within the string.
458
+ char *allocated = malloc(length);
459
+ if (allocated == NULL) {
460
+ yp_diagnostic_list_append(error_list, value, value + length, "Failed to allocate memory for unescaping.");
461
+ return;
462
+ }
463
+
464
+ // This is the memory address where we're putting the unescaped string.
465
+ char *dest = allocated;
466
+ size_t dest_length = 0;
467
+
468
+ // This is the current position in the source string that we're looking at.
469
+ // It's going to move along behind the backslash so that we can copy each
470
+ // segment of the string that doesn't contain an escape.
471
+ const char *cursor = value;
472
+ const char *end = value + length;
473
+
474
+ // For each escape found in the source string, we will handle it and update
475
+ // the moving cursor->backslash window.
476
+ while (backslash != NULL && backslash + 1 < end) {
477
+ assert(dest_length < length);
478
+
479
+ // This is the size of the segment of the string from the previous escape
480
+ // or the start of the string to the current escape.
481
+ size_t segment_size = (size_t) (backslash - cursor);
482
+
483
+ // Here we're going to copy everything up until the escape into the
484
+ // destination buffer.
485
+ memcpy(dest + dest_length, cursor, segment_size);
486
+ dest_length += segment_size;
487
+
488
+ switch (backslash[1]) {
489
+ case '\\':
490
+ case '\'':
491
+ dest[dest_length++] = (char) unescape_chars[(unsigned char) backslash[1]];
492
+ cursor = backslash + 2;
493
+ break;
494
+ default:
495
+ if (unescape_type == YP_UNESCAPE_MINIMAL) {
496
+ // In this case we're escaping something that doesn't need escaping.
497
+ dest[dest_length++] = '\\';
498
+ cursor = backslash + 1;
499
+ break;
500
+ }
501
+
502
+ // This is the only type of unescaping left. In this case we need to
503
+ // handle all of the different unescapes.
504
+ assert(unescape_type == YP_UNESCAPE_ALL);
505
+ cursor = unescape(dest, &dest_length, backslash, end, error_list, YP_UNESCAPE_FLAG_NONE, true);
506
+ break;
507
+ }
508
+
509
+ if (end > cursor) {
510
+ backslash = yp_memchr(cursor, '\\', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding);
511
+ } else {
512
+ backslash = NULL;
513
+ }
514
+ }
515
+
516
+ // We need to copy the final segment of the string after the last escape.
517
+ if (end > cursor) {
518
+ memcpy(dest + dest_length, cursor, (size_t) (end - cursor));
519
+ } else {
520
+ cursor = end;
521
+ }
522
+
523
+ // We also need to update the length at the end. This is because every escape
524
+ // reduces the length of the final string, and we don't want garbage at the
525
+ // end.
526
+ yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor)));
527
+ }
528
+
529
+ YP_EXPORTED_FUNCTION bool
530
+ yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
531
+ bool success;
532
+
533
+ yp_list_t error_list;
534
+ yp_list_init(&error_list);
535
+
536
+ yp_parser_t parser;
537
+ yp_parser_init(&parser, start, length, "");
538
+
539
+ yp_unescape_manipulate_string(&parser, start, length, result, unescape_type, &error_list);
540
+ success = yp_list_empty_p(&error_list);
541
+
542
+ yp_list_free(&error_list);
543
+ yp_parser_free(&parser);
544
+
545
+ return success;
546
+ }
547
+
548
+ // This function is similar to yp_unescape_manipulate_string, except it doesn't
549
+ // actually perform any string manipulations. Instead, it calculates how long
550
+ // the unescaped character is, and returns that value
551
+ YP_EXPORTED_FUNCTION size_t
552
+ yp_unescape_calculate_difference(const char *backslash, const char *end, yp_unescape_type_t unescape_type, bool expect_single_codepoint, yp_list_t *error_list) {
553
+ assert(unescape_type != YP_UNESCAPE_NONE);
554
+
555
+ switch (backslash[1]) {
556
+ case '\\':
557
+ case '\'':
558
+ return 2;
559
+ default: {
560
+ if (unescape_type == YP_UNESCAPE_MINIMAL) return 2;
561
+
562
+ // This is the only type of unescaping left. In this case we need to
563
+ // handle all of the different unescapes.
564
+ assert(unescape_type == YP_UNESCAPE_ALL);
565
+
566
+ unsigned char flags = YP_UNESCAPE_FLAG_NONE;
567
+ if (expect_single_codepoint)
568
+ flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
569
+
570
+ const char *cursor = unescape(NULL, 0, backslash, end, error_list, flags, false);
571
+ assert(cursor > backslash);
572
+
573
+ return (size_t) (cursor - backslash);
574
+ }
575
+ }
576
+ }
@@ -0,0 +1,78 @@
1
+ #include "yarp/util/yp_buffer.h"
2
+
3
+ #define YP_BUFFER_INITIAL_SIZE 1024
4
+
5
+ // Initialize a yp_buffer_t with its default values.
6
+ bool
7
+ yp_buffer_init(yp_buffer_t *buffer) {
8
+ buffer->length = 0;
9
+ buffer->capacity = YP_BUFFER_INITIAL_SIZE;
10
+
11
+ buffer->value = (char *) malloc(YP_BUFFER_INITIAL_SIZE);
12
+ return buffer->value != NULL;
13
+ }
14
+
15
+ // Append the given amount of space to the buffer.
16
+ static inline void
17
+ yp_buffer_append_length(yp_buffer_t *buffer, size_t length) {
18
+ size_t next_length = buffer->length + length;
19
+
20
+ if (next_length > buffer->capacity) {
21
+ do {
22
+ buffer->capacity *= 2;
23
+ } while (next_length > buffer->capacity);
24
+
25
+ buffer->value = realloc(buffer->value, buffer->capacity);
26
+ }
27
+
28
+ buffer->length = next_length;
29
+ }
30
+
31
+ // Append a generic pointer to memory to the buffer.
32
+ static inline void
33
+ yp_buffer_append(yp_buffer_t *buffer, const void *source, size_t length) {
34
+ yp_buffer_append_length(buffer, length);
35
+ memcpy(buffer->value + (buffer->length - length), source, length);
36
+ }
37
+
38
+ // Append the given amount of space as zeroes to the buffer.
39
+ void
40
+ yp_buffer_append_zeroes(yp_buffer_t *buffer, size_t length) {
41
+ yp_buffer_append_length(buffer, length);
42
+ memset(buffer->value + (buffer->length - length), 0, length);
43
+ }
44
+
45
+ // Append a string to the buffer.
46
+ void
47
+ yp_buffer_append_str(yp_buffer_t *buffer, const char *value, size_t length) {
48
+ const void *source = value;
49
+ yp_buffer_append(buffer, source, length);
50
+ }
51
+
52
+ // Append a single byte to the buffer.
53
+ void
54
+ yp_buffer_append_u8(yp_buffer_t *buffer, uint8_t value) {
55
+ const void *source = &value;
56
+ yp_buffer_append(buffer, source, sizeof(uint8_t));
57
+ }
58
+
59
+ // Append a 32-bit unsigned integer to the buffer.
60
+ void
61
+ yp_buffer_append_u32(yp_buffer_t *buffer, uint32_t value) {
62
+ if (value < 128) {
63
+ yp_buffer_append_u8(buffer, (uint8_t) value);
64
+ } else {
65
+ uint32_t n = value;
66
+ while (n >= 128) {
67
+ yp_buffer_append_u8(buffer, (uint8_t) (n | 128));
68
+ n >>= 7;
69
+ }
70
+ yp_buffer_append_u8(buffer, (uint8_t) n);
71
+ }
72
+ }
73
+
74
+ // Free the memory associated with the buffer.
75
+ void
76
+ yp_buffer_free(yp_buffer_t *buffer) {
77
+ free(buffer->value);
78
+ }