yarp 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/CODE_OF_CONDUCT.md +76 -0
  3. data/CONTRIBUTING.md +51 -0
  4. data/LICENSE.md +7 -0
  5. data/Makefile.in +79 -0
  6. data/README.md +86 -0
  7. data/config.h.in +25 -0
  8. data/config.yml +2147 -0
  9. data/configure +4487 -0
  10. data/docs/build_system.md +85 -0
  11. data/docs/building.md +26 -0
  12. data/docs/configuration.md +56 -0
  13. data/docs/design.md +53 -0
  14. data/docs/encoding.md +116 -0
  15. data/docs/extension.md +20 -0
  16. data/docs/fuzzing.md +93 -0
  17. data/docs/heredocs.md +36 -0
  18. data/docs/mapping.md +117 -0
  19. data/docs/ripper.md +36 -0
  20. data/docs/serialization.md +130 -0
  21. data/docs/testing.md +55 -0
  22. data/ext/yarp/api_node.c +3680 -0
  23. data/ext/yarp/api_pack.c +256 -0
  24. data/ext/yarp/extconf.rb +131 -0
  25. data/ext/yarp/extension.c +547 -0
  26. data/ext/yarp/extension.h +18 -0
  27. data/include/yarp/ast.h +1412 -0
  28. data/include/yarp/defines.h +54 -0
  29. data/include/yarp/diagnostic.h +24 -0
  30. data/include/yarp/enc/yp_encoding.h +94 -0
  31. data/include/yarp/node.h +36 -0
  32. data/include/yarp/pack.h +141 -0
  33. data/include/yarp/parser.h +389 -0
  34. data/include/yarp/regexp.h +19 -0
  35. data/include/yarp/unescape.h +42 -0
  36. data/include/yarp/util/yp_buffer.h +39 -0
  37. data/include/yarp/util/yp_char.h +75 -0
  38. data/include/yarp/util/yp_constant_pool.h +64 -0
  39. data/include/yarp/util/yp_list.h +67 -0
  40. data/include/yarp/util/yp_memchr.h +14 -0
  41. data/include/yarp/util/yp_newline_list.h +54 -0
  42. data/include/yarp/util/yp_state_stack.h +24 -0
  43. data/include/yarp/util/yp_string.h +57 -0
  44. data/include/yarp/util/yp_string_list.h +28 -0
  45. data/include/yarp/util/yp_strpbrk.h +29 -0
  46. data/include/yarp/version.h +5 -0
  47. data/include/yarp.h +69 -0
  48. data/lib/yarp/lex_compat.rb +759 -0
  49. data/lib/yarp/node.rb +7428 -0
  50. data/lib/yarp/pack.rb +185 -0
  51. data/lib/yarp/ripper_compat.rb +174 -0
  52. data/lib/yarp/serialize.rb +389 -0
  53. data/lib/yarp.rb +330 -0
  54. data/src/diagnostic.c +25 -0
  55. data/src/enc/yp_big5.c +79 -0
  56. data/src/enc/yp_euc_jp.c +85 -0
  57. data/src/enc/yp_gbk.c +88 -0
  58. data/src/enc/yp_shift_jis.c +83 -0
  59. data/src/enc/yp_tables.c +509 -0
  60. data/src/enc/yp_unicode.c +2320 -0
  61. data/src/enc/yp_windows_31j.c +83 -0
  62. data/src/node.c +2011 -0
  63. data/src/pack.c +493 -0
  64. data/src/prettyprint.c +1782 -0
  65. data/src/regexp.c +580 -0
  66. data/src/serialize.c +1576 -0
  67. data/src/token_type.c +347 -0
  68. data/src/unescape.c +576 -0
  69. data/src/util/yp_buffer.c +78 -0
  70. data/src/util/yp_char.c +229 -0
  71. data/src/util/yp_constant_pool.c +147 -0
  72. data/src/util/yp_list.c +50 -0
  73. data/src/util/yp_memchr.c +31 -0
  74. data/src/util/yp_newline_list.c +119 -0
  75. data/src/util/yp_state_stack.c +25 -0
  76. data/src/util/yp_string.c +207 -0
  77. data/src/util/yp_string_list.c +32 -0
  78. data/src/util/yp_strncasecmp.c +20 -0
  79. data/src/util/yp_strpbrk.c +66 -0
  80. data/src/yarp.c +13211 -0
  81. data/yarp.gemspec +100 -0
  82. metadata +125 -0
data/src/unescape.c ADDED
@@ -0,0 +1,576 @@
1
+ #include "yarp.h"
2
+
3
+ /******************************************************************************/
4
+ /* Character checks */
5
+ /******************************************************************************/
6
+
7
// Returns true when every one of the first `length` bytes starting at `c`
// satisfies yp_char_is_hexadecimal_digit. A zero-length range yields true.
// Scanning stops at the first byte that is not a hexadecimal digit.
static inline bool
yp_char_is_hexadecimal_digits(const char *c, size_t length) {
    while (length-- > 0) {
        if (!yp_char_is_hexadecimal_digit(*c++)) return false;
    }

    return true;
}
16
+
17
+ /******************************************************************************/
18
+ /* Lookup tables for characters */
19
+ /******************************************************************************/
20
+
21
// Maps the character that follows a backslash to the single byte it stands
// for. The table is indexed by that character; every index without an explicit
// designator below is implicitly zero.
static const unsigned char unescape_chars[] = {
    ['a'] = 0x07,  // bell
    ['b'] = 0x08,  // backspace
    ['t'] = 0x09,  // horizontal tab
    ['n'] = 0x0A,  // line feed
    ['v'] = 0x0B,  // vertical tab
    ['f'] = 0x0C,  // form feed
    ['r'] = 0x0D,  // carriage return
    ['e'] = 0x1B,  // escape
    ['s'] = 0x20,  // space
    ['\''] = 0x27, // single quote
    ['\\'] = 0x5C  // backslash
};
35
+
36
// One entry per 7-bit ASCII code. An entry is 1 for codes 0x09-0x0D and for
// 0x20-0x7E with the exception of the backslash (0x5C); every other entry,
// including DEL (0x7F), is 0.
static const bool ascii_printable_chars[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
};

// Looks the byte up in ascii_printable_chars. Bytes with the high bit set are
// outside the table and always yield false.
static inline bool
char_is_ascii_printable(const char c) {
    const unsigned char byte = (unsigned char) c;

    if (byte >= 0x80) {
        return false;
    }

    return ascii_printable_chars[byte];
}
53
+
54
+ /******************************************************************************/
55
+ /* Unescaping for segments */
56
+ /******************************************************************************/
57
+
58
// Decode the octal escape that begins at `backslash`. backslash[1] is always
// taken as the first digit; backslash[2] and backslash[3] are folded in for as
// long as yp_char_is_octal_digit accepts them. The decoded byte is stored in
// *value. The return value is the offset from `backslash` to the first byte
// that was not consumed (2, 3 or 4), i.e. the digit count plus one for the
// backslash itself.
static inline size_t
unescape_octal(const char *backslash, unsigned char *value) {
    unsigned char accumulator = (unsigned char) (backslash[1] - '0');
    size_t consumed = 2;

    while (consumed < 4 && yp_char_is_octal_digit(backslash[consumed])) {
        accumulator = (unsigned char) ((accumulator << 3) | (backslash[consumed] - '0'));
        consumed++;
    }

    *value = accumulator;
    return consumed;
}
75
+
76
// Map one hexadecimal digit character onto its numeric value. Anything that
// compares <= '9' is treated as a decimal digit; for everything else the low
// three bits of the character plus nine give the value, which covers both
// 'a'-'f' and 'A'-'F'. The input is not validated here.
static inline unsigned char
unescape_hexadecimal_digit(const char value) {
    if (value <= '9') {
        return (unsigned char) (value - '0');
    }

    return (unsigned char) ((value & 0x7) + 9);
}
81
+
82
// Decode the \x escape that begins at `backslash`. backslash[2] is always
// taken as the first digit; backslash[3] is folded in as a second digit when
// yp_char_is_hexadecimal_digit accepts it. The decoded byte is stored in
// *value. The return value is the offset from `backslash` to the first byte
// that was not consumed: 3 for one digit, 4 for two.
static inline size_t
unescape_hexadecimal(const char *backslash, unsigned char *value) {
    const unsigned char high = unescape_hexadecimal_digit(backslash[2]);

    if (yp_char_is_hexadecimal_digit(backslash[3])) {
        *value = (unsigned char) ((high << 4) | unescape_hexadecimal_digit(backslash[3]));
        return 4;
    }

    *value = high;
    return 3;
}
94
+
95
// Fold `length` hexadecimal digits starting at `string` into a single 32-bit
// number, most significant digit first, and store it in *value. Nothing is
// returned. The digits are not validated here; the caller is expected to have
// checked them already. A zero length stores 0.
static inline void
unescape_unicode(const char *string, size_t length, uint32_t *value) {
    uint32_t codepoint = 0;

    for (size_t remaining = length; remaining > 0; remaining--) {
        codepoint = (codepoint << 4) | unescape_hexadecimal_digit(*string++);
    }

    *value = codepoint;
}
106
+
107
+ // Accepts the pointer to the string to write the unicode value along with the
108
+ // 32-bit value to write. Writes the UTF-8 representation of the value to the
109
+ // string and returns the number of bytes written.
110
+ static inline size_t
111
+ unescape_unicode_write(char *dest, uint32_t value, const char *start, const char *end, yp_list_t *error_list) {
112
+ unsigned char *bytes = (unsigned char *) dest;
113
+
114
+ if (value <= 0x7F) {
115
+ // 0xxxxxxx
116
+ bytes[0] = (unsigned char) value;
117
+ return 1;
118
+ }
119
+
120
+ if (value <= 0x7FF) {
121
+ // 110xxxxx 10xxxxxx
122
+ bytes[0] = (unsigned char) (0xC0 | (value >> 6));
123
+ bytes[1] = (unsigned char) (0x80 | (value & 0x3F));
124
+ return 2;
125
+ }
126
+
127
+ if (value <= 0xFFFF) {
128
+ // 1110xxxx 10xxxxxx 10xxxxxx
129
+ bytes[0] = (unsigned char) (0xE0 | (value >> 12));
130
+ bytes[1] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
131
+ bytes[2] = (unsigned char) (0x80 | (value & 0x3F));
132
+ return 3;
133
+ }
134
+
135
+ // At this point it must be a 4 digit UTF-8 representation. If it's not, then
136
+ // the input is invalid.
137
+ if (value <= 0x10FFFF) {
138
+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
139
+ bytes[0] = (unsigned char) (0xF0 | (value >> 18));
140
+ bytes[1] = (unsigned char) (0x80 | ((value >> 12) & 0x3F));
141
+ bytes[2] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
142
+ bytes[3] = (unsigned char) (0x80 | (value & 0x3F));
143
+ return 4;
144
+ }
145
+
146
+ // If we get here, then the value is too big. This is an error, but we don't
147
+ // want to just crash, so instead we'll add an error to the error list and put
148
+ // in a replacement character instead.
149
+ yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
150
+ bytes[0] = 0xEF;
151
+ bytes[1] = 0xBF;
152
+ bytes[2] = 0xBD;
153
+ return 3;
154
+ }
155
+
156
// Bit flags that modify how an escape sequence is interpreted. Each non-zero
// flag occupies its own bit so that they can be combined with bitwise OR.
typedef enum {
    YP_UNESCAPE_FLAG_NONE = 0,
    YP_UNESCAPE_FLAG_CONTROL = 1 << 0,
    YP_UNESCAPE_FLAG_META = 1 << 1,
    YP_UNESCAPE_FLAG_EXPECT_SINGLE = 1 << 2
} yp_unescape_flag_t;
162
+
163
+ // Unescape a single character value based on the given flags.
164
+ static inline unsigned char
165
+ unescape_char(const unsigned char value, const unsigned char flags) {
166
+ unsigned char unescaped = value;
167
+
168
+ if (flags & YP_UNESCAPE_FLAG_CONTROL) {
169
+ unescaped &= 0x1f;
170
+ }
171
+
172
+ if (flags & YP_UNESCAPE_FLAG_META) {
173
+ unescaped |= 0x80;
174
+ }
175
+
176
+ return unescaped;
177
+ }
178
+
179
+ // Read a specific escape sequence into the given destination.
180
+ static const char *
181
+ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end, yp_list_t *error_list, const unsigned char flags, bool write_to_str) {
182
+ switch (backslash[1]) {
183
+ // \a \b \e \f \n \r \s \t \v
184
+ case '\r': {
185
+ // if this is an \r\n we need to escape both
186
+ if (write_to_str) {
187
+ dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[1]], flags);
188
+ }
189
+
190
+ if (backslash + 2 < end && backslash[2] == '\n') {
191
+ if (write_to_str) {
192
+ dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[2]], flags);
193
+ }
194
+ return backslash + 3;
195
+ }
196
+
197
+ return backslash + 2;
198
+ }
199
+ case 'a':
200
+ case 'b':
201
+ case 'e':
202
+ case 'f':
203
+ case 'n':
204
+ case 'r':
205
+ case 's':
206
+ case 't':
207
+ case 'v':
208
+ if (write_to_str) {
209
+ dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[1]], flags);
210
+ }
211
+ return backslash + 2;
212
+ // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
213
+ case '0': case '1': case '2': case '3': case '4':
214
+ case '5': case '6': case '7': case '8': case '9': {
215
+ unsigned char value;
216
+ const char *cursor = backslash + unescape_octal(backslash, &value);
217
+
218
+ if (write_to_str) {
219
+ dest[(*dest_length)++] = (char) unescape_char(value, flags);
220
+ }
221
+ return cursor;
222
+ }
223
+ // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
224
+ case 'x': {
225
+ unsigned char value;
226
+ const char *cursor = backslash + unescape_hexadecimal(backslash, &value);
227
+
228
+ if (write_to_str) {
229
+ dest[(*dest_length)++] = (char) unescape_char(value, flags);
230
+ }
231
+ return cursor;
232
+ }
233
+ // \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
234
+ // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
235
+ case 'u': {
236
+ if ((flags & YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) {
237
+ yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
238
+ return backslash + 2;
239
+ }
240
+
241
+ if ((backslash + 3) < end && backslash[2] == '{') {
242
+ const char *unicode_cursor = backslash + 3;
243
+ const char *extra_codepoints_start = NULL;
244
+ int codepoints_count = 0;
245
+
246
+ unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
247
+
248
+ while ((*unicode_cursor != '}') && (unicode_cursor < end)) {
249
+ const char *unicode_start = unicode_cursor;
250
+ size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
251
+
252
+ // \u{nnnn} character literal allows only 1-6 hexadecimal digits
253
+ if (hexadecimal_length > 6)
254
+ yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
255
+
256
+ // there are not hexadecimal characters
257
+ if (hexadecimal_length == 0) {
258
+ yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
259
+ return unicode_cursor;
260
+ }
261
+
262
+ unicode_cursor += hexadecimal_length;
263
+
264
+ codepoints_count++;
265
+ if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count == 2)
266
+ extra_codepoints_start = unicode_start;
267
+
268
+ uint32_t value;
269
+ unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
270
+ if (write_to_str) {
271
+ *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
272
+ }
273
+
274
+ unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
275
+ }
276
+
277
+ // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
278
+ if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
279
+ yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
280
+
281
+ return unicode_cursor + 1;
282
+ }
283
+
284
+ if ((backslash + 2) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
285
+ uint32_t value;
286
+ unescape_unicode(backslash + 2, 4, &value);
287
+
288
+ if (write_to_str) {
289
+ *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
290
+ }
291
+ return backslash + 6;
292
+ }
293
+
294
+ yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
295
+ return backslash + 2;
296
+ }
297
+ // \c\M-x meta control character, where x is an ASCII printable character
298
+ // \c? delete, ASCII 7Fh (DEL)
299
+ // \cx control character, where x is an ASCII printable character
300
+ case 'c':
301
+ if (backslash + 2 >= end) {
302
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
303
+ return end;
304
+ }
305
+
306
+ if (flags & YP_UNESCAPE_FLAG_CONTROL) {
307
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
308
+ return backslash + 2;
309
+ }
310
+
311
+ switch (backslash[2]) {
312
+ case '\\':
313
+ return unescape(dest, dest_length, backslash + 2, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
314
+ case '?':
315
+ if (write_to_str) {
316
+ dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
317
+ }
318
+ return backslash + 3;
319
+ default: {
320
+ if (!char_is_ascii_printable(backslash[2])) {
321
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
322
+ return backslash + 2;
323
+ }
324
+
325
+ if (write_to_str) {
326
+ dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
327
+ }
328
+ return backslash + 3;
329
+ }
330
+ }
331
+ // \C-x control character, where x is an ASCII printable character
332
+ // \C-? delete, ASCII 7Fh (DEL)
333
+ case 'C':
334
+ if (backslash + 3 >= end) {
335
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
336
+ return end;
337
+ }
338
+
339
+ if (flags & YP_UNESCAPE_FLAG_CONTROL) {
340
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
341
+ return backslash + 2;
342
+ }
343
+
344
+ if (backslash[2] != '-') {
345
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
346
+ return backslash + 2;
347
+ }
348
+
349
+ switch (backslash[3]) {
350
+ case '\\':
351
+ return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
352
+ case '?':
353
+ if (write_to_str) {
354
+ dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
355
+ }
356
+ return backslash + 4;
357
+ default:
358
+ if (!char_is_ascii_printable(backslash[3])) {
359
+ yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
360
+ return backslash + 2;
361
+ }
362
+
363
+ if (write_to_str) {
364
+ dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
365
+ }
366
+ return backslash + 4;
367
+ }
368
+ // \M-\C-x meta control character, where x is an ASCII printable character
369
+ // \M-\cx meta control character, where x is an ASCII printable character
370
+ // \M-x meta character, where x is an ASCII printable character
371
+ case 'M': {
372
+ if (backslash + 3 >= end) {
373
+ yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
374
+ return end;
375
+ }
376
+
377
+ if (flags & YP_UNESCAPE_FLAG_META) {
378
+ yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
379
+ return backslash + 2;
380
+ }
381
+
382
+ if (backslash[2] != '-') {
383
+ yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
384
+ return backslash + 2;
385
+ }
386
+
387
+ if (backslash[3] == '\\') {
388
+ return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_META, write_to_str);
389
+ }
390
+
391
+ if (char_is_ascii_printable(backslash[3])) {
392
+ if (write_to_str) {
393
+ dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_META);
394
+ }
395
+ return backslash + 4;
396
+ }
397
+
398
+ yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
399
+ return backslash + 3;
400
+ }
401
+ // In this case we're escaping something that doesn't need escaping.
402
+ default:
403
+ {
404
+ if (write_to_str) {
405
+ dest[(*dest_length)++] = backslash[1];
406
+ }
407
+ return backslash + 2;
408
+ }
409
+ }
410
+ }
411
+
412
+ /******************************************************************************/
413
+ /* Public functions and entrypoints */
414
+ /******************************************************************************/
415
+
416
+ // Unescape the contents of the given token into the given string using the
417
+ // given unescape mode. The supported escapes are:
418
+ //
419
+ // \a bell, ASCII 07h (BEL)
420
+ // \b backspace, ASCII 08h (BS)
421
+ // \t horizontal tab, ASCII 09h (TAB)
422
+ // \n newline (line feed), ASCII 0Ah (LF)
423
+ // \v vertical tab, ASCII 0Bh (VT)
424
+ // \f form feed, ASCII 0Ch (FF)
425
+ // \r carriage return, ASCII 0Dh (CR)
426
+ // \e escape, ASCII 1Bh (ESC)
427
+ // \s space, ASCII 20h (SPC)
428
+ // \\ backslash
429
+ // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
430
+ // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
431
+ // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
432
+ // \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
433
+ // \cx or \C-x control character, where x is an ASCII printable character
434
+ // \M-x meta character, where x is an ASCII printable character
435
+ // \M-\C-x meta control character, where x is an ASCII printable character
436
+ // \M-\cx same as above
437
+ // \c\M-x same as above
438
+ // \c? or \C-? delete, ASCII 7Fh (DEL)
439
+ //
440
+ YP_EXPORTED_FUNCTION void
441
+ yp_unescape_manipulate_string(yp_parser_t *parser, const char *value, size_t length, yp_string_t *string, yp_unescape_type_t unescape_type, yp_list_t *error_list) {
442
+ if (unescape_type == YP_UNESCAPE_NONE) {
443
+ // If we're not unescaping then we can reference the source directly.
444
+ yp_string_shared_init(string, value, value + length);
445
+ return;
446
+ }
447
+
448
+ const char *backslash = yp_memchr(value, '\\', length, parser->encoding_changed, &parser->encoding);
449
+
450
+ if (backslash == NULL) {
451
+ // Here there are no escapes, so we can reference the source directly.
452
+ yp_string_shared_init(string, value, value + length);
453
+ return;
454
+ }
455
+
456
+ // Here we have found an escape character, so we need to handle all escapes
457
+ // within the string.
458
+ char *allocated = malloc(length);
459
+ if (allocated == NULL) {
460
+ yp_diagnostic_list_append(error_list, value, value + length, "Failed to allocate memory for unescaping.");
461
+ return;
462
+ }
463
+
464
+ // This is the memory address where we're putting the unescaped string.
465
+ char *dest = allocated;
466
+ size_t dest_length = 0;
467
+
468
+ // This is the current position in the source string that we're looking at.
469
+ // It's going to move along behind the backslash so that we can copy each
470
+ // segment of the string that doesn't contain an escape.
471
+ const char *cursor = value;
472
+ const char *end = value + length;
473
+
474
+ // For each escape found in the source string, we will handle it and update
475
+ // the moving cursor->backslash window.
476
+ while (backslash != NULL && backslash + 1 < end) {
477
+ assert(dest_length < length);
478
+
479
+ // This is the size of the segment of the string from the previous escape
480
+ // or the start of the string to the current escape.
481
+ size_t segment_size = (size_t) (backslash - cursor);
482
+
483
+ // Here we're going to copy everything up until the escape into the
484
+ // destination buffer.
485
+ memcpy(dest + dest_length, cursor, segment_size);
486
+ dest_length += segment_size;
487
+
488
+ switch (backslash[1]) {
489
+ case '\\':
490
+ case '\'':
491
+ dest[dest_length++] = (char) unescape_chars[(unsigned char) backslash[1]];
492
+ cursor = backslash + 2;
493
+ break;
494
+ default:
495
+ if (unescape_type == YP_UNESCAPE_MINIMAL) {
496
+ // In this case we're escaping something that doesn't need escaping.
497
+ dest[dest_length++] = '\\';
498
+ cursor = backslash + 1;
499
+ break;
500
+ }
501
+
502
+ // This is the only type of unescaping left. In this case we need to
503
+ // handle all of the different unescapes.
504
+ assert(unescape_type == YP_UNESCAPE_ALL);
505
+ cursor = unescape(dest, &dest_length, backslash, end, error_list, YP_UNESCAPE_FLAG_NONE, true);
506
+ break;
507
+ }
508
+
509
+ if (end > cursor) {
510
+ backslash = yp_memchr(cursor, '\\', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding);
511
+ } else {
512
+ backslash = NULL;
513
+ }
514
+ }
515
+
516
+ // We need to copy the final segment of the string after the last escape.
517
+ if (end > cursor) {
518
+ memcpy(dest + dest_length, cursor, (size_t) (end - cursor));
519
+ } else {
520
+ cursor = end;
521
+ }
522
+
523
+ // We also need to update the length at the end. This is because every escape
524
+ // reduces the length of the final string, and we don't want garbage at the
525
+ // end.
526
+ yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor)));
527
+ }
528
+
529
+ YP_EXPORTED_FUNCTION bool
530
+ yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
531
+ bool success;
532
+
533
+ yp_list_t error_list;
534
+ yp_list_init(&error_list);
535
+
536
+ yp_parser_t parser;
537
+ yp_parser_init(&parser, start, length, "");
538
+
539
+ yp_unescape_manipulate_string(&parser, start, length, result, unescape_type, &error_list);
540
+ success = yp_list_empty_p(&error_list);
541
+
542
+ yp_list_free(&error_list);
543
+ yp_parser_free(&parser);
544
+
545
+ return success;
546
+ }
547
+
548
+ // This function is similar to yp_unescape_manipulate_string, except it doesn't
549
+ // actually perform any string manipulations. Instead, it calculates how long
550
+ // the unescaped character is, and returns that value
551
+ YP_EXPORTED_FUNCTION size_t
552
+ yp_unescape_calculate_difference(const char *backslash, const char *end, yp_unescape_type_t unescape_type, bool expect_single_codepoint, yp_list_t *error_list) {
553
+ assert(unescape_type != YP_UNESCAPE_NONE);
554
+
555
+ switch (backslash[1]) {
556
+ case '\\':
557
+ case '\'':
558
+ return 2;
559
+ default: {
560
+ if (unescape_type == YP_UNESCAPE_MINIMAL) return 2;
561
+
562
+ // This is the only type of unescaping left. In this case we need to
563
+ // handle all of the different unescapes.
564
+ assert(unescape_type == YP_UNESCAPE_ALL);
565
+
566
+ unsigned char flags = YP_UNESCAPE_FLAG_NONE;
567
+ if (expect_single_codepoint)
568
+ flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
569
+
570
+ const char *cursor = unescape(NULL, 0, backslash, end, error_list, flags, false);
571
+ assert(cursor > backslash);
572
+
573
+ return (size_t) (cursor - backslash);
574
+ }
575
+ }
576
+ }
data/src/util/yp_buffer.c ADDED
@@ -0,0 +1,78 @@
1
+ #include "yarp/util/yp_buffer.h"
2
+
3
+ #define YP_BUFFER_INITIAL_SIZE 1024
4
+
5
+ // Initialize a yp_buffer_t with its default values.
6
+ bool
7
+ yp_buffer_init(yp_buffer_t *buffer) {
8
+ buffer->length = 0;
9
+ buffer->capacity = YP_BUFFER_INITIAL_SIZE;
10
+
11
+ buffer->value = (char *) malloc(YP_BUFFER_INITIAL_SIZE);
12
+ return buffer->value != NULL;
13
+ }
14
+
15
+ // Append the given amount of space to the buffer.
16
+ static inline void
17
+ yp_buffer_append_length(yp_buffer_t *buffer, size_t length) {
18
+ size_t next_length = buffer->length + length;
19
+
20
+ if (next_length > buffer->capacity) {
21
+ do {
22
+ buffer->capacity *= 2;
23
+ } while (next_length > buffer->capacity);
24
+
25
+ buffer->value = realloc(buffer->value, buffer->capacity);
26
+ }
27
+
28
+ buffer->length = next_length;
29
+ }
30
+
31
+ // Append a generic pointer to memory to the buffer.
32
+ static inline void
33
+ yp_buffer_append(yp_buffer_t *buffer, const void *source, size_t length) {
34
+ yp_buffer_append_length(buffer, length);
35
+ memcpy(buffer->value + (buffer->length - length), source, length);
36
+ }
37
+
38
+ // Append the given amount of space as zeroes to the buffer.
39
+ void
40
+ yp_buffer_append_zeroes(yp_buffer_t *buffer, size_t length) {
41
+ yp_buffer_append_length(buffer, length);
42
+ memset(buffer->value + (buffer->length - length), 0, length);
43
+ }
44
+
45
+ // Append a string to the buffer.
46
+ void
47
+ yp_buffer_append_str(yp_buffer_t *buffer, const char *value, size_t length) {
48
+ const void *source = value;
49
+ yp_buffer_append(buffer, source, length);
50
+ }
51
+
52
+ // Append a single byte to the buffer.
53
+ void
54
+ yp_buffer_append_u8(yp_buffer_t *buffer, uint8_t value) {
55
+ const void *source = &value;
56
+ yp_buffer_append(buffer, source, sizeof(uint8_t));
57
+ }
58
+
59
+ // Append a 32-bit unsigned integer to the buffer.
60
+ void
61
+ yp_buffer_append_u32(yp_buffer_t *buffer, uint32_t value) {
62
+ if (value < 128) {
63
+ yp_buffer_append_u8(buffer, (uint8_t) value);
64
+ } else {
65
+ uint32_t n = value;
66
+ while (n >= 128) {
67
+ yp_buffer_append_u8(buffer, (uint8_t) (n | 128));
68
+ n >>= 7;
69
+ }
70
+ yp_buffer_append_u8(buffer, (uint8_t) n);
71
+ }
72
+ }
73
+
74
+ // Free the memory associated with the buffer.
75
+ void
76
+ yp_buffer_free(yp_buffer_t *buffer) {
77
+ free(buffer->value);
78
+ }