RubyGems - yarp - Versions diffs - 0.6.0 - Mend

yarp 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (82) hide show

checksums.yaml +7 -0
data/CODE_OF_CONDUCT.md +76 -0
data/CONTRIBUTING.md +51 -0
data/LICENSE.md +7 -0
data/Makefile.in +79 -0
data/README.md +86 -0
data/config.h.in +25 -0
data/config.yml +2147 -0
data/configure +4487 -0
data/docs/build_system.md +85 -0
data/docs/building.md +26 -0
data/docs/configuration.md +56 -0
data/docs/design.md +53 -0
data/docs/encoding.md +116 -0
data/docs/extension.md +20 -0
data/docs/fuzzing.md +93 -0
data/docs/heredocs.md +36 -0
data/docs/mapping.md +117 -0
data/docs/ripper.md +36 -0
data/docs/serialization.md +130 -0
data/docs/testing.md +55 -0
data/ext/yarp/api_node.c +3680 -0
data/ext/yarp/api_pack.c +256 -0
data/ext/yarp/extconf.rb +131 -0
data/ext/yarp/extension.c +547 -0
data/ext/yarp/extension.h +18 -0
data/include/yarp/ast.h +1412 -0
data/include/yarp/defines.h +54 -0
data/include/yarp/diagnostic.h +24 -0
data/include/yarp/enc/yp_encoding.h +94 -0
data/include/yarp/node.h +36 -0
data/include/yarp/pack.h +141 -0
data/include/yarp/parser.h +389 -0
data/include/yarp/regexp.h +19 -0
data/include/yarp/unescape.h +42 -0
data/include/yarp/util/yp_buffer.h +39 -0
data/include/yarp/util/yp_char.h +75 -0
data/include/yarp/util/yp_constant_pool.h +64 -0
data/include/yarp/util/yp_list.h +67 -0
data/include/yarp/util/yp_memchr.h +14 -0
data/include/yarp/util/yp_newline_list.h +54 -0
data/include/yarp/util/yp_state_stack.h +24 -0
data/include/yarp/util/yp_string.h +57 -0
data/include/yarp/util/yp_string_list.h +28 -0
data/include/yarp/util/yp_strpbrk.h +29 -0
data/include/yarp/version.h +5 -0
data/include/yarp.h +69 -0
data/lib/yarp/lex_compat.rb +759 -0
data/lib/yarp/node.rb +7428 -0
data/lib/yarp/pack.rb +185 -0
data/lib/yarp/ripper_compat.rb +174 -0
data/lib/yarp/serialize.rb +389 -0
data/lib/yarp.rb +330 -0
data/src/diagnostic.c +25 -0
data/src/enc/yp_big5.c +79 -0
data/src/enc/yp_euc_jp.c +85 -0
data/src/enc/yp_gbk.c +88 -0
data/src/enc/yp_shift_jis.c +83 -0
data/src/enc/yp_tables.c +509 -0
data/src/enc/yp_unicode.c +2320 -0
data/src/enc/yp_windows_31j.c +83 -0
data/src/node.c +2011 -0
data/src/pack.c +493 -0
data/src/prettyprint.c +1782 -0
data/src/regexp.c +580 -0
data/src/serialize.c +1576 -0
data/src/token_type.c +347 -0
data/src/unescape.c +576 -0
data/src/util/yp_buffer.c +78 -0
data/src/util/yp_char.c +229 -0
data/src/util/yp_constant_pool.c +147 -0
data/src/util/yp_list.c +50 -0
data/src/util/yp_memchr.c +31 -0
data/src/util/yp_newline_list.c +119 -0
data/src/util/yp_state_stack.c +25 -0
data/src/util/yp_string.c +207 -0
data/src/util/yp_string_list.c +32 -0
data/src/util/yp_strncasecmp.c +20 -0
data/src/util/yp_strpbrk.c +66 -0
data/src/yarp.c +13211 -0
data/yarp.gemspec +100 -0
metadata +125 -0

data/src/unescape.c ADDED Viewed

@@ -0,0 +1,576 @@
+#include "yarp.h"
+/******************************************************************************/
+/* Character checks                                                           */
+/******************************************************************************/
+// Returns true when each of the first `length` characters starting at `c` is a
+// hexadecimal digit according to yp_char_is_hexadecimal_digit. A length of
+// zero yields true because there is nothing to reject.
+static inline bool
+yp_char_is_hexadecimal_digits(const char *c, size_t length) {
+    const char *limit = c + length;
+    for (const char *cursor = c; cursor != limit; cursor++) {
+        if (!yp_char_is_hexadecimal_digit(*cursor)) {
+            return false;
+        }
+    }
+    return true;
+}
+/******************************************************************************/
+/* Lookup tables for characters                                               */
+/******************************************************************************/
+// This is a lookup table for unescapes that only take up a single character.
+// It is indexed by the character that follows the backslash and yields the
+// byte that the escape stands for. Entries that are not listed here are
+// implicitly zero, and the array only extends up to the largest designator
+// ('v'), so it must not be indexed with a character greater than 'v'.
+static const unsigned char unescape_chars[] = {
+    ['\''] = '\'',
+    ['\\'] = '\\',
+    ['a'] = '\a',
+    ['b'] = '\b',
+    ['e'] = '\033',
+    ['f'] = '\f',
+    ['n'] = '\n',
+    ['r'] = '\r',
+    ['s'] = ' ',
+    ['t'] = '\t',
+    ['v'] = '\v'
+};
+// Lookup table, indexed by 7-bit ASCII code, that says whether a character is
+// accepted as the operand of a control or meta escape. Note that the table
+// marks tab through carriage return (0x09-0x0D) as printable and marks
+// backslash (0x5C) and DEL (0x7F) as not printable.
+static const bool ascii_printable_chars[] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, // 0x00 - 0x0F
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10 - 0x1F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20 - 0x2F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30 - 0x3F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40 - 0x4F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 0x50 - 0x5F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60 - 0x6F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0  // 0x70 - 0x7F
+};
+// Returns true if the given character is a 7-bit ASCII character that the
+// table above marks as printable. Any byte with the high bit set is rejected
+// before the table is consulted, so the lookup never goes out of bounds.
+static inline bool
+char_is_ascii_printable(const char c) {
+    const unsigned char code = (unsigned char) c;
+    if (code >= 0x80) {
+        return false;
+    }
+    return ascii_printable_chars[code];
+}
+/******************************************************************************/
+/* Unescaping for segments                                                    */
+/******************************************************************************/
+// Decode an octal escape whose backslash is at `backslash` and whose first
+// digit is at backslash[1]. Up to two further octal digits are folded in, and
+// the result (truncated to a single byte) is stored into `value`. Returns the
+// total length of the escape including the backslash, i.e. 2, 3 or 4.
+//
+// NOTE(review): backslash[2] and backslash[3] are read without any bound being
+// passed in, so the caller must guarantee those bytes are readable.
+static inline size_t
+unescape_octal(const char *backslash, unsigned char *value) {
+    size_t consumed = 2;
+    unsigned char accumulated = (unsigned char) (backslash[1] - '0');
+    while (consumed < 4 && yp_char_is_octal_digit(backslash[consumed])) {
+        accumulated = (unsigned char) ((accumulated << 3) | (backslash[consumed] - '0'));
+        consumed++;
+    }
+    *value = accumulated;
+    return consumed;
+}
+// Convert a single hexadecimal digit character into its numeric value (0-15).
+// The character is assumed to already be a valid hexadecimal digit: decimal
+// digits map directly, while for letters the low three bits ('a'/'A' -> 1,
+// ..., 'f'/'F' -> 6) are offset by 9, which handles both cases at once.
+static inline unsigned char
+unescape_hexadecimal_digit(const char value) {
+    if (value <= '9') {
+        return (unsigned char) (value - '0');
+    }
+    return (unsigned char) ((value & 0x7) + 9);
+}
+// Decode a \x escape whose backslash is at `backslash`. The digit at
+// backslash[2] is always consumed; the one at backslash[3] is consumed only if
+// it is a hexadecimal digit. The decoded byte is stored into `value`. Returns
+// the total length of the escape including the leading "\x", i.e. 3 or 4.
+//
+// NOTE(review): backslash[2] is not validated here and no bound is passed in,
+// so the caller is responsible for both.
+static inline size_t
+unescape_hexadecimal(const char *backslash, unsigned char *value) {
+    const unsigned char high = unescape_hexadecimal_digit(backslash[2]);
+    if (yp_char_is_hexadecimal_digit(backslash[3])) {
+        *value = (unsigned char) ((high << 4) | unescape_hexadecimal_digit(backslash[3]));
+        return 4;
+    }
+    *value = high;
+    return 3;
+}
+// Decode `length` hexadecimal digits starting at `string` into a codepoint and
+// store it into `value`. Nothing is returned. This function assumes that the
+// characters have already been validated as hexadecimal digits; a length of
+// zero stores 0.
+static inline void
+unescape_unicode(const char *string, size_t length, uint32_t *value) {
+    uint32_t codepoint = 0;
+    for (const char *digit = string; digit != string + length; digit++) {
+        codepoint = (codepoint << 4) | unescape_hexadecimal_digit(*digit);
+    }
+    *value = codepoint;
+}
+// Writes the UTF-8 encoding of the codepoint `value` to `dest` and returns the
+// number of bytes written (1 to 4). If the codepoint is larger than 0x10FFFF,
+// a diagnostic covering `start`..`end` is appended to `error_list` and the
+// three bytes of U+FFFD (the replacement character) are written instead. The
+// caller must make sure that `dest` has room for up to 4 bytes.
+static inline size_t
+unescape_unicode_write(char *dest, uint32_t value, const char *start, const char *end, yp_list_t *error_list) {
+    unsigned char *out = (unsigned char *) dest;
+    if (value > 0x10FFFF) {
+        // The value is too big to be a codepoint. This is an error, but we
+        // don't want to just crash, so we report it and substitute U+FFFD.
+        yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
+        out[0] = 0xEF;
+        out[1] = 0xBF;
+        out[2] = 0xBD;
+        return 3;
+    }
+    if (value > 0xFFFF) {
+        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        out[0] = (unsigned char) (0xF0 | (value >> 18));
+        out[1] = (unsigned char) (0x80 | ((value >> 12) & 0x3F));
+        out[2] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
+        out[3] = (unsigned char) (0x80 | (value & 0x3F));
+        return 4;
+    }
+    if (value > 0x7FF) {
+        // 1110xxxx 10xxxxxx 10xxxxxx
+        out[0] = (unsigned char) (0xE0 | (value >> 12));
+        out[1] = (unsigned char) (0x80 | ((value >> 6) & 0x3F));
+        out[2] = (unsigned char) (0x80 | (value & 0x3F));
+        return 3;
+    }
+    if (value > 0x7F) {
+        // 110xxxxx 10xxxxxx
+        out[0] = (unsigned char) (0xC0 | (value >> 6));
+        out[1] = (unsigned char) (0x80 | (value & 0x3F));
+        return 2;
+    }
+    // 0xxxxxxx
+    out[0] = (unsigned char) value;
+    return 1;
+}
+// Bit flags that modify how an escape sequence is interpreted. They can be
+// combined with bitwise or.
+typedef enum {
+    YP_UNESCAPE_FLAG_NONE = 0,
+    YP_UNESCAPE_FLAG_CONTROL = 1 << 0,
+    YP_UNESCAPE_FLAG_META = 1 << 1,
+    YP_UNESCAPE_FLAG_EXPECT_SINGLE = 1 << 2
+} yp_unescape_flag_t;
+// Apply the control and meta modifiers to a single byte. The control flag
+// keeps only the low five bits of the value and the meta flag sets the high
+// bit; when both are given the control mask is applied first. Other flags are
+// ignored.
+static inline unsigned char
+unescape_char(const unsigned char value, const unsigned char flags) {
+    const unsigned char control_mask = (flags & YP_UNESCAPE_FLAG_CONTROL) ? 0x1f : 0xff;
+    const unsigned char meta_bit = (flags & YP_UNESCAPE_FLAG_META) ? 0x80 : 0x00;
+    return (unsigned char) ((value & control_mask) | meta_bit);
+}
+// Read a specific escape sequence into the given destination.
+//
+// `backslash` points at the backslash that starts the escape and `end` points
+// one past the last byte of the source. When `write_to_str` is true the
+// decoded bytes are appended at dest[*dest_length] and *dest_length is
+// advanced; when it is false neither `dest` nor `dest_length` is touched, so
+// both may be NULL. `flags` is a combination of YP_UNESCAPE_FLAG_* values
+// carried over from enclosing \c, \C- and \M- escapes. Problems are reported by
+// appending to `error_list`. Returns a pointer to the first byte after the
+// escape sequence.
+static const char *
+unescape(char *dest, size_t *dest_length, const char *backslash, const char *end, yp_list_t *error_list, const unsigned char flags, bool write_to_str) {
+    switch (backslash[1]) {
+        // A backslash followed by a literal carriage return.
+        case '\r': {
+            // if this is an \r\n we need to escape both
+            // NOTE(review): unescape_chars has no entry for '\r' or '\n', so
+            // the bytes written here come from zero entries of the table.
+            // Confirm that this is the intended output.
+            if (write_to_str) {
+                dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[1]], flags);
+            }
+            if (backslash + 2 < end && backslash[2] == '\n') {
+                if (write_to_str) {
+                    dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[2]], flags);
+                }
+                return backslash + 3;
+            }
+            return backslash + 2;
+        }
+        // \a \b \e \f \n \r \s \t \v
+        case 'a':
+        case 'b':
+        case 'e':
+        case 'f':
+        case 'n':
+        case 'r':
+        case 's':
+        case 't':
+        case 'v':
+            if (write_to_str) {
+                dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[1]], flags);
+            }
+            return backslash + 2;
+        // \nnn         octal bit pattern, where nnn is 1-3 octal digits ([0-7])
+        //
+        // Only 0-7 can start an octal escape. 8 and 9 are deliberately not
+        // listed here so that they fall through to the default case and are
+        // copied as-is instead of being fed to the octal decoder.
+        case '0': case '1': case '2': case '3':
+        case '4': case '5': case '6': case '7': {
+            unsigned char value;
+            const char *cursor = backslash + unescape_octal(backslash, &value);
+            if (write_to_str) {
+                dest[(*dest_length)++] = (char) unescape_char(value, flags);
+            }
+            return cursor;
+        }
+        // \xnn         hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
+        case 'x': {
+            unsigned char value;
+            const char *cursor = backslash + unescape_hexadecimal(backslash, &value);
+            if (write_to_str) {
+                dest[(*dest_length)++] = (char) unescape_char(value, flags);
+            }
+            return cursor;
+        }
+        // \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
+        // \unnnn       Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
+        case 'u': {
+            if (flags & (YP_UNESCAPE_FLAG_CONTROL | YP_UNESCAPE_FLAG_META)) {
+                yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
+                return backslash + 2;
+            }
+            if ((backslash + 3) < end && backslash[2] == '{') {
+                const char *unicode_cursor = backslash + 3;
+                const char *extra_codepoints_start = NULL;
+                int codepoints_count = 0;
+                unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
+                // The bound has to be checked before the cursor is
+                // dereferenced, otherwise an unterminated escape reads one
+                // byte past the end of the source.
+                while ((unicode_cursor < end) && (*unicode_cursor != '}')) {
+                    const char *unicode_start = unicode_cursor;
+                    size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
+                    // \u{nnnn} character literal allows only 1-6 hexadecimal digits
+                    if (hexadecimal_length > 6) {
+                        yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
+                    }
+                    // there are not hexadecimal characters
+                    if (hexadecimal_length == 0) {
+                        yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
+                        return unicode_cursor;
+                    }
+                    unicode_cursor += hexadecimal_length;
+                    codepoints_count++;
+                    // Remember where the second codepoint starts so that the
+                    // diagnostic below can point at the extra codepoints.
+                    if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count == 2) {
+                        extra_codepoints_start = unicode_start;
+                    }
+                    uint32_t value;
+                    unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
+                    if (write_to_str) {
+                        *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
+                    }
+                    unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
+                }
+                // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
+                if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1) {
+                    yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
+                }
+                // If we ran off the end of the source without finding the
+                // closing brace, report it and stop at the end instead of
+                // returning a pointer past it.
+                if (unicode_cursor >= end) {
+                    yp_diagnostic_list_append(error_list, backslash, end, "unterminated Unicode escape");
+                    return end;
+                }
+                // Skip over the closing brace.
+                return unicode_cursor + 1;
+            }
+            // All four digits have to be inside of the source before they can
+            // be inspected, so require at least 6 bytes from the backslash.
+            if ((end - backslash) >= 6 && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
+                uint32_t value;
+                unescape_unicode(backslash + 2, 4, &value);
+                if (write_to_str) {
+                    *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
+                }
+                return backslash + 6;
+            }
+            yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
+            return backslash + 2;
+        }
+        // \c\M-x       meta control character, where x is an ASCII printable character
+        // \c?          delete, ASCII 7Fh (DEL)
+        // \cx          control character, where x is an ASCII printable character
+        case 'c':
+            if (backslash + 2 >= end) {
+                yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
+                return end;
+            }
+            if (flags & YP_UNESCAPE_FLAG_CONTROL) {
+                yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
+                return backslash + 2;
+            }
+            switch (backslash[2]) {
+                case '\\':
+                    // A nested escape: decode it with the control flag added.
+                    return unescape(dest, dest_length, backslash + 2, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
+                case '?':
+                    if (write_to_str) {
+                        dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
+                    }
+                    return backslash + 3;
+                default: {
+                    if (!char_is_ascii_printable(backslash[2])) {
+                        yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
+                        return backslash + 2;
+                    }
+                    if (write_to_str) {
+                        dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
+                    }
+                    return backslash + 3;
+                }
+            }
+        // \C-x         control character, where x is an ASCII printable character
+        // \C-?         delete, ASCII 7Fh (DEL)
+        case 'C':
+            if (backslash + 3 >= end) {
+                yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
+                return end;
+            }
+            if (flags & YP_UNESCAPE_FLAG_CONTROL) {
+                yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
+                return backslash + 2;
+            }
+            if (backslash[2] != '-') {
+                yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
+                return backslash + 2;
+            }
+            switch (backslash[3]) {
+                case '\\':
+                    // A nested escape: decode it with the control flag added.
+                    return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
+                case '?':
+                    if (write_to_str) {
+                        dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
+                    }
+                    return backslash + 4;
+                default:
+                    if (!char_is_ascii_printable(backslash[3])) {
+                        yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
+                        return backslash + 2;
+                    }
+                    if (write_to_str) {
+                        dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
+                    }
+                    return backslash + 4;
+            }
+        // \M-\C-x      meta control character, where x is an ASCII printable character
+        // \M-\cx       meta control character, where x is an ASCII printable character
+        // \M-x         meta character, where x is an ASCII printable character
+        case 'M': {
+            if (backslash + 3 >= end) {
+                yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
+                return end;
+            }
+            if (flags & YP_UNESCAPE_FLAG_META) {
+                yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
+                return backslash + 2;
+            }
+            if (backslash[2] != '-') {
+                yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
+                return backslash + 2;
+            }
+            if (backslash[3] == '\\') {
+                // A nested escape: decode it with the meta flag added.
+                return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_META, write_to_str);
+            }
+            if (char_is_ascii_printable(backslash[3])) {
+                if (write_to_str) {
+                    dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_META);
+                }
+                return backslash + 4;
+            }
+            yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
+            return backslash + 3;
+        }
+        // In this case we're escaping something that doesn't need escaping, so
+        // the escaped character is copied through unchanged.
+        default:
+            {
+                if (write_to_str) {
+                    dest[(*dest_length)++] = backslash[1];
+                }
+                return backslash + 2;
+            }
+    }
+}
+/******************************************************************************/
+/* Public functions and entrypoints                                           */
+/******************************************************************************/
+// Unescape the contents of the given token into the given string using the
+// given unescape mode. The supported escapes are:
+//
+// \a             bell, ASCII 07h (BEL)
+// \b             backspace, ASCII 08h (BS)
+// \t             horizontal tab, ASCII 09h (TAB)
+// \n             newline (line feed), ASCII 0Ah (LF)
+// \v             vertical tab, ASCII 0Bh (VT)
+// \f             form feed, ASCII 0Ch (FF)
+// \r             carriage return, ASCII 0Dh (CR)
+// \e             escape, ASCII 1Bh (ESC)
+// \s             space, ASCII 20h (SPC)
+// \\             backslash
+// \nnn           octal bit pattern, where nnn is 1-3 octal digits ([0-7])
+// \xnn           hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
+// \unnnn         Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
+// \u{nnnn ...}   Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
+// \cx or \C-x    control character, where x is an ASCII printable character
+// \M-x           meta character, where x is an ASCII printable character
+// \M-\C-x        meta control character, where x is an ASCII printable character
+// \M-\cx         same as above
+// \c\M-x         same as above
+// \c? or \C-?    delete, ASCII 7Fh (DEL)
+//
+// `value` and `length` describe the source bytes and `string` receives the
+// result. When nothing needs to be rewritten (no unescaping requested, or no
+// backslash found) `string` is initialized as a shared view of the source;
+// otherwise it is initialized as an owned string backed by a freshly
+// allocated buffer. Problems are reported by appending to `error_list`. If
+// the buffer cannot be allocated, an error is appended and `string` falls
+// back to a shared view of the source so that it is always initialized.
+YP_EXPORTED_FUNCTION void
+yp_unescape_manipulate_string(yp_parser_t *parser, const char *value, size_t length, yp_string_t *string, yp_unescape_type_t unescape_type, yp_list_t *error_list) {
+    if (unescape_type == YP_UNESCAPE_NONE) {
+        // If we're not unescaping then we can reference the source directly.
+        yp_string_shared_init(string, value, value + length);
+        return;
+    }
+    const char *backslash = yp_memchr(value, '\\', length, parser->encoding_changed, &parser->encoding);
+    if (backslash == NULL) {
+        // Here there are no escapes, so we can reference the source directly.
+        yp_string_shared_init(string, value, value + length);
+        return;
+    }
+    // Here we have found an escape character, so we need to handle all escapes
+    // within the string.
+    char *allocated = malloc(length);
+    if (allocated == NULL) {
+        yp_diagnostic_list_append(error_list, value, value + length, "Failed to allocate memory for unescaping.");
+        // Callers get no return value to check, so make sure the output is
+        // still a valid string by pointing it at the raw source.
+        yp_string_shared_init(string, value, value + length);
+        return;
+    }
+    // This is the memory address where we're putting the unescaped string.
+    char *dest = allocated;
+    size_t dest_length = 0;
+    // This is the current position in the source string that we're looking at.
+    // It's going to move along behind the backslash so that we can copy each
+    // segment of the string that doesn't contain an escape.
+    const char *cursor = value;
+    const char *end = value + length;
+    // For each escape found in the source string, we will handle it and update
+    // the moving cursor->backslash window. A backslash that is the very last
+    // byte is not treated as an escape; it is copied with the final segment.
+    while (backslash != NULL && backslash + 1 < end) {
+        assert(dest_length < length);
+        // This is the size of the segment of the string from the previous escape
+        // or the start of the string to the current escape.
+        size_t segment_size = (size_t) (backslash - cursor);
+        // Here we're going to copy everything up until the escape into the
+        // destination buffer.
+        memcpy(dest + dest_length, cursor, segment_size);
+        dest_length += segment_size;
+        switch (backslash[1]) {
+            case '\\':
+            case '\'':
+                // These two are unescaped in every mode.
+                dest[dest_length++] = (char) unescape_chars[(unsigned char) backslash[1]];
+                cursor = backslash + 2;
+                break;
+            default:
+                if (unescape_type == YP_UNESCAPE_MINIMAL) {
+                    // In this case we're escaping something that doesn't need
+                    // escaping. Keep the backslash and resume right after it,
+                    // so the following character is copied with the next
+                    // segment.
+                    dest[dest_length++] = '\\';
+                    cursor = backslash + 1;
+                    break;
+                }
+                // This is the only type of unescaping left. In this case we need to
+                // handle all of the different unescapes.
+                assert(unescape_type == YP_UNESCAPE_ALL);
+                cursor = unescape(dest, &dest_length, backslash, end, error_list, YP_UNESCAPE_FLAG_NONE, true);
+                break;
+        }
+        // Look for the next escape, if there is any source left to search.
+        if (end > cursor) {
+            backslash = yp_memchr(cursor, '\\', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding);
+        } else {
+            backslash = NULL;
+        }
+    }
+    // We need to copy the final segment of the string after the last escape.
+    if (end > cursor) {
+        memcpy(dest + dest_length, cursor, (size_t) (end - cursor));
+    } else {
+        // Clamp the cursor so that the length computed below does not count a
+        // negative remainder.
+        cursor = end;
+    }
+    // We also need to update the length at the end. The unescaped contents can
+    // be shorter than the source, and we don't want garbage at the end.
+    yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor)));
+}
+// Convenience wrapper around yp_unescape_manipulate_string for callers that
+// have neither a parser nor an error list. A temporary parser and error list
+// are set up over the same source, the unescaped contents are stored into
+// `result`, and both temporaries are freed again. Returns true if unescaping
+// did not append any errors.
+YP_EXPORTED_FUNCTION bool
+yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
+    yp_list_t errors;
+    yp_list_init(&errors);
+
+    yp_parser_t scratch_parser;
+    yp_parser_init(&scratch_parser, start, length, "");
+
+    yp_unescape_manipulate_string(&scratch_parser, start, length, result, unescape_type, &errors);
+    // The outcome has to be captured before the list is freed.
+    const bool no_errors = yp_list_empty_p(&errors);
+
+    yp_list_free(&errors);
+    yp_parser_free(&scratch_parser);
+    return no_errors;
+}
+// This function is similar to yp_unescape_manipulate_string, except it doesn't
+// actually perform any string manipulations. Instead, it works out how many
+// bytes of source the escape sequence starting at `backslash` occupies and
+// returns that count. `unescape_type` must not be YP_UNESCAPE_NONE. When
+// `expect_single_codepoint` is true, a \u{...} escape holding more than one
+// codepoint is reported through `error_list`.
+YP_EXPORTED_FUNCTION size_t
+yp_unescape_calculate_difference(const char *backslash, const char *end, yp_unescape_type_t unescape_type, bool expect_single_codepoint, yp_list_t *error_list) {
+    assert(unescape_type != YP_UNESCAPE_NONE);
+    // An escaped backslash or single quote is always two bytes, and in minimal
+    // mode every other escape is treated as two bytes as well.
+    const char escaped = backslash[1];
+    if (escaped == '\\' || escaped == '\'' || unescape_type == YP_UNESCAPE_MINIMAL) {
+        return 2;
+    }
+    // This is the only type of unescaping left. In this case we need to
+    // handle all of the different unescapes.
+    assert(unescape_type == YP_UNESCAPE_ALL);
+    const unsigned char flags = expect_single_codepoint ? YP_UNESCAPE_FLAG_EXPECT_SINGLE : YP_UNESCAPE_FLAG_NONE;
+    // Nothing is written in this mode, so no destination is needed.
+    const char *after_escape = unescape(NULL, NULL, backslash, end, error_list, flags, false);
+    assert(after_escape > backslash);
+    return (size_t) (after_escape - backslash);
+}

data/src/util/yp_buffer.c ADDED Viewed

@@ -0,0 +1,78 @@
+#include "yarp/util/yp_buffer.h"
+#define YP_BUFFER_INITIAL_SIZE 1024
+// Initialize a yp_buffer_t as an empty buffer backed by a fresh allocation of
+// YP_BUFFER_INITIAL_SIZE bytes. Returns false if that allocation failed, in
+// which case the buffer's value is NULL.
+bool
+yp_buffer_init(yp_buffer_t *buffer) {
+    char *storage = (char *) malloc(YP_BUFFER_INITIAL_SIZE);
+    buffer->value = storage;
+    buffer->capacity = YP_BUFFER_INITIAL_SIZE;
+    buffer->length = 0;
+    return storage != NULL;
+}
+// Reserve `length` more bytes at the end of the buffer, growing the backing
+// allocation by doubling when it is too small, and advance buffer->length past
+// the reserved bytes. Returns true on success. Returns false, leaving the
+// buffer exactly as it was, if the new length would overflow or if the
+// allocation could not be grown.
+static inline bool
+yp_buffer_append_length(yp_buffer_t *buffer, size_t length) {
+    size_t next_length = buffer->length + length;
+    if (next_length < buffer->length) {
+        // The addition wrapped around.
+        return false;
+    }
+    if (next_length > buffer->capacity) {
+        // Start from at least 1 so that doubling always makes progress.
+        size_t next_capacity = buffer->capacity == 0 ? 1 : buffer->capacity;
+        while (next_capacity < next_length) {
+            if (next_capacity > ((size_t) -1) / 2) {
+                // Doubling would wrap around, so ask for exactly what we need.
+                next_capacity = next_length;
+                break;
+            }
+            next_capacity *= 2;
+        }
+        // Keep the old pointer until we know that realloc succeeded, otherwise
+        // a failure would leak the existing allocation.
+        void *next_value = realloc(buffer->value, next_capacity);
+        if (next_value == NULL) {
+            return false;
+        }
+        buffer->value = next_value;
+        buffer->capacity = next_capacity;
+    }
+    buffer->length = next_length;
+    return true;
+}
+// Append a generic pointer to memory to the buffer. If the buffer cannot be
+// grown, nothing is appended and the buffer is left unchanged.
+static inline void
+yp_buffer_append(yp_buffer_t *buffer, const void *source, size_t length) {
+    if (!yp_buffer_append_length(buffer, length)) {
+        return;
+    }
+    memcpy(buffer->value + (buffer->length - length), source, length);
+}
+// Append the given amount of space as zeroes to the buffer. If the buffer
+// cannot be grown, nothing is appended and the buffer is left unchanged.
+void
+yp_buffer_append_zeroes(yp_buffer_t *buffer, size_t length) {
+    if (!yp_buffer_append_length(buffer, length)) {
+        return;
+    }
+    memset(buffer->value + (buffer->length - length), 0, length);
+}
+// Append the `length` bytes starting at `value` to the buffer. The length is
+// taken as given, so no terminator is looked for or added.
+void
+yp_buffer_append_str(yp_buffer_t *buffer, const char *value, size_t length) {
+    yp_buffer_append(buffer, (const void *) value, length);
+}
+// Append a single byte to the buffer.
+void
+yp_buffer_append_u8(yp_buffer_t *buffer, uint8_t value) {
+    // The parameter is a local copy, so its address can be handed over as the
+    // one-byte source.
+    yp_buffer_append(buffer, &value, sizeof(value));
+}
+// Append a 32-bit unsigned integer to the buffer using a variable-length
+// encoding: the value is emitted seven bits at a time, least significant
+// group first, and every byte except the last has its high bit set. Values
+// below 128 therefore take a single byte.
+void
+yp_buffer_append_u32(yp_buffer_t *buffer, uint32_t value) {
+    uint32_t remaining = value;
+    for (; remaining >= 128; remaining >>= 7) {
+        // The cast keeps the low eight bits: seven bits of payload plus the
+        // continuation bit.
+        yp_buffer_append_u8(buffer, (uint8_t) (remaining | 128));
+    }
+    yp_buffer_append_u8(buffer, (uint8_t) remaining);
+}
+// Free the memory associated with the buffer. Only the allocation held in
+// buffer->value is released; the yp_buffer_t itself is not freed and its
+// fields are left as they were, so the buffer must not be used again without
+// being reinitialized.
+void
+yp_buffer_free(yp_buffer_t *buffer) {
+    free(buffer->value);
+}