nokogumbo 1.5.0 → 2.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +237 -26
- data/ext/nokogumbo/extconf.rb +144 -0
- data/ext/nokogumbo/nokogumbo.c +793 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +26 -28
- data/gumbo-parser/src/attribute.h +3 -23
- data/gumbo-parser/src/char_ref.c +5972 -6816
- data/gumbo-parser/src/char_ref.h +14 -45
- data/gumbo-parser/src/error.c +510 -163
- data/gumbo-parser/src/error.h +70 -147
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/gumbo.h +577 -305
- data/gumbo-parser/src/insertion_mode.h +4 -28
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +2922 -2228
- data/gumbo-parser/src/parser.h +6 -22
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +43 -50
- data/gumbo-parser/src/string_buffer.h +24 -40
- data/gumbo-parser/src/string_piece.c +39 -39
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/tag.c +186 -59
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +1 -25
- data/gumbo-parser/src/tokenizer.c +2128 -1562
- data/gumbo-parser/src/tokenizer.h +41 -52
- data/gumbo-parser/src/tokenizer_states.h +281 -45
- data/gumbo-parser/src/utf8.c +98 -123
- data/gumbo-parser/src/utf8.h +84 -52
- data/gumbo-parser/src/util.c +48 -38
- data/gumbo-parser/src/util.h +10 -40
- data/gumbo-parser/src/vector.c +45 -57
- data/gumbo-parser/src/vector.h +17 -39
- data/lib/nokogumbo.rb +18 -170
- data/lib/nokogumbo/html5.rb +252 -0
- data/lib/nokogumbo/html5/document.rb +53 -0
- data/lib/nokogumbo/html5/document_fragment.rb +62 -0
- data/lib/nokogumbo/html5/node.rb +72 -0
- data/lib/nokogumbo/version.rb +3 -0
- metadata +40 -21
- data/ext/nokogumboc/extconf.rb +0 -60
- data/ext/nokogumboc/nokogumbo.c +0 -295
- data/gumbo-parser/src/char_ref.rl +0 -2554
- data/gumbo-parser/src/string_piece.h +0 -38
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -153
- data/gumbo-parser/src/tag_gperf.h +0 -105
- data/gumbo-parser/src/tag_sizes.h +0 -4
- data/gumbo-parser/src/tag_strings.h +0 -153
- data/gumbo-parser/visualc/include/strings.h +0 -4
- data/test-nokogumbo.rb +0 -190
    
        data/gumbo-parser/src/error.h
    CHANGED
    
    | @@ -1,32 +1,13 @@ | |
| 1 | 
            -
            // Copyright 2010 Google Inc. All Rights Reserved.
         | 
| 2 | 
            -
            //
         | 
| 3 | 
            -
            // Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 4 | 
            -
            // you may not use this file except in compliance with the License.
         | 
| 5 | 
            -
            // You may obtain a copy of the License at
         | 
| 6 | 
            -
            //
         | 
| 7 | 
            -
            //     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 8 | 
            -
            //
         | 
| 9 | 
            -
            // Unless required by applicable law or agreed to in writing, software
         | 
| 10 | 
            -
            // distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 11 | 
            -
            // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 12 | 
            -
            // See the License for the specific language governing permissions and
         | 
| 13 | 
            -
            // limitations under the License.
         | 
| 14 | 
            -
            //
         | 
| 15 | 
            -
            // Author: jdtang@google.com (Jonathan Tang)
         | 
| 16 | 
            -
            //
         | 
| 17 | 
            -
            // Error types, enums, and handling functions.
         | 
| 18 | 
            -
             | 
| 19 1 | 
             
            #ifndef GUMBO_ERROR_H_
         | 
| 20 2 | 
             
            #define GUMBO_ERROR_H_
         | 
| 21 | 
            -
             | 
| 22 | 
            -
            #define _CRT_SECURE_NO_WARNINGS
         | 
| 23 | 
            -
            #endif
         | 
| 3 | 
            +
             | 
| 24 4 | 
             
            #include <stdint.h>
         | 
| 25 5 |  | 
| 26 6 | 
             
            #include "gumbo.h"
         | 
| 27 7 | 
             
            #include "insertion_mode.h"
         | 
| 28 8 | 
             
            #include "string_buffer.h"
         | 
| 29 9 | 
             
            #include "token_type.h"
         | 
| 10 | 
            +
            #include "tokenizer_states.h"
         | 
| 30 11 |  | 
| 31 12 | 
             
            #ifdef __cplusplus
         | 
| 32 13 | 
             
            extern "C" {
         | 
| @@ -35,84 +16,66 @@ extern "C" { | |
| 35 16 | 
             
            struct GumboInternalParser;
         | 
| 36 17 |  | 
| 37 18 | 
             
            typedef enum {
         | 
| 19 | 
            +
              // Defined errors.
         | 
| 20 | 
            +
              // https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
         | 
| 21 | 
            +
              GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT,
         | 
| 22 | 
            +
              GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER,
         | 
| 23 | 
            +
              GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER,
         | 
| 24 | 
            +
              GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
         | 
| 25 | 
            +
              GUMBO_ERR_CDATA_IN_HTML_CONTENT,
         | 
| 26 | 
            +
              GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
         | 
| 27 | 
            +
              GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM,
         | 
| 28 | 
            +
              GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
         | 
| 29 | 
            +
              GUMBO_ERR_END_TAG_WITH_ATTRIBUTES,
         | 
| 30 | 
            +
              GUMBO_ERR_DUPLICATE_ATTRIBUTE,
         | 
| 31 | 
            +
              GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS,
         | 
| 32 | 
            +
              GUMBO_ERR_EOF_BEFORE_TAG_NAME,
         | 
| 33 | 
            +
              GUMBO_ERR_EOF_IN_CDATA,
         | 
| 34 | 
            +
              GUMBO_ERR_EOF_IN_COMMENT,
         | 
| 35 | 
            +
              GUMBO_ERR_EOF_IN_DOCTYPE,
         | 
| 36 | 
            +
              GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT,
         | 
| 37 | 
            +
              GUMBO_ERR_EOF_IN_TAG,
         | 
| 38 | 
            +
              GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT,
         | 
| 39 | 
            +
              GUMBO_ERR_INCORRECTLY_OPENED_COMMENT,
         | 
| 40 | 
            +
              GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME,
         | 
| 41 | 
            +
              GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME,
         | 
| 42 | 
            +
              GUMBO_ERR_MISSING_ATTRIBUTE_VALUE,
         | 
| 43 | 
            +
              GUMBO_ERR_MISSING_DOCTYPE_NAME,
         | 
| 44 | 
            +
              GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER,
         | 
| 45 | 
            +
              GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER,
         | 
| 46 | 
            +
              GUMBO_ERR_MISSING_END_TAG_NAME,
         | 
| 47 | 
            +
              GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
         | 
| 48 | 
            +
              GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
         | 
| 49 | 
            +
              GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
         | 
| 50 | 
            +
              GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
         | 
| 51 | 
            +
              GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
         | 
| 52 | 
            +
              GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME,
         | 
| 53 | 
            +
              GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
         | 
| 54 | 
            +
              GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
         | 
| 55 | 
            +
              GUMBO_ERR_NESTED_COMMENT,
         | 
| 56 | 
            +
              GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
         | 
| 57 | 
            +
              GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM,
         | 
| 58 | 
            +
              GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS,
         | 
| 59 | 
            +
              GUMBO_ERR_NULL_CHARACTER_REFERENCE,
         | 
| 60 | 
            +
              GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
         | 
| 61 | 
            +
              GUMBO_ERR_SURROGATE_IN_INPUT_STREAM,
         | 
| 62 | 
            +
              GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
         | 
| 63 | 
            +
              GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME,
         | 
| 64 | 
            +
              GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE,
         | 
| 65 | 
            +
              GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME,
         | 
| 66 | 
            +
              GUMBO_ERR_UNEXPECTED_NULL_CHARACTER,
         | 
| 67 | 
            +
              GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME,
         | 
| 68 | 
            +
              GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG,
         | 
| 69 | 
            +
              GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE,
         | 
| 70 | 
            +
             | 
| 71 | 
            +
              // Encoding errors.
         | 
| 38 72 | 
             
              GUMBO_ERR_UTF8_INVALID,
         | 
| 39 73 | 
             
              GUMBO_ERR_UTF8_TRUNCATED,
         | 
| 40 | 
            -
             | 
| 41 | 
            -
               | 
| 42 | 
            -
              GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
         | 
| 43 | 
            -
              GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
         | 
| 44 | 
            -
              GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
         | 
| 45 | 
            -
              GUMBO_ERR_NAMED_CHAR_REF_INVALID,
         | 
| 46 | 
            -
              GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
         | 
| 47 | 
            -
              GUMBO_ERR_TAG_EOF,
         | 
| 48 | 
            -
              GUMBO_ERR_TAG_INVALID,
         | 
| 49 | 
            -
              GUMBO_ERR_CLOSE_TAG_EMPTY,
         | 
| 50 | 
            -
              GUMBO_ERR_CLOSE_TAG_EOF,
         | 
| 51 | 
            -
              GUMBO_ERR_CLOSE_TAG_INVALID,
         | 
| 52 | 
            -
              GUMBO_ERR_SCRIPT_EOF,
         | 
| 53 | 
            -
              GUMBO_ERR_ATTR_NAME_EOF,
         | 
| 54 | 
            -
              GUMBO_ERR_ATTR_NAME_INVALID,
         | 
| 55 | 
            -
              GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
         | 
| 56 | 
            -
              GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
         | 
| 57 | 
            -
              GUMBO_ERR_ATTR_UNQUOTED_EOF,
         | 
| 58 | 
            -
              GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
         | 
| 59 | 
            -
              GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
         | 
| 60 | 
            -
              GUMBO_ERR_ATTR_AFTER_EOF,
         | 
| 61 | 
            -
              GUMBO_ERR_ATTR_AFTER_INVALID,
         | 
| 62 | 
            -
              GUMBO_ERR_DUPLICATE_ATTR,
         | 
| 63 | 
            -
              GUMBO_ERR_SOLIDUS_EOF,
         | 
| 64 | 
            -
              GUMBO_ERR_SOLIDUS_INVALID,
         | 
| 65 | 
            -
              GUMBO_ERR_DASHES_OR_DOCTYPE,
         | 
| 66 | 
            -
              GUMBO_ERR_COMMENT_EOF,
         | 
| 67 | 
            -
              GUMBO_ERR_COMMENT_INVALID,
         | 
| 68 | 
            -
              GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
         | 
| 69 | 
            -
              GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
         | 
| 70 | 
            -
              GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
         | 
| 71 | 
            -
              GUMBO_ERR_COMMENT_END_BANG_EOF,
         | 
| 72 | 
            -
              GUMBO_ERR_DOCTYPE_EOF,
         | 
| 73 | 
            -
              GUMBO_ERR_DOCTYPE_INVALID,
         | 
| 74 | 
            -
              GUMBO_ERR_DOCTYPE_SPACE,
         | 
| 75 | 
            -
              GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
         | 
| 76 | 
            -
              GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
         | 
| 77 | 
            -
              GUMBO_ERR_DOCTYPE_END,
         | 
| 74 | 
            +
             | 
| 75 | 
            +
              // Generic parser error.
         | 
| 78 76 | 
             
              GUMBO_ERR_PARSER,
         | 
| 79 | 
            -
              GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
         | 
| 80 77 | 
             
            } GumboErrorType;
         | 
| 81 78 |  | 
| 82 | 
            -
            // Additional data for duplicated attributes.
         | 
| 83 | 
            -
            typedef struct GumboInternalDuplicateAttrError {
         | 
| 84 | 
            -
              // The name of the attribute.  Owned by this struct.
         | 
| 85 | 
            -
              const char* name;
         | 
| 86 | 
            -
             | 
| 87 | 
            -
              // The (0-based) index within the attributes vector of the original
         | 
| 88 | 
            -
              // occurrence.
         | 
| 89 | 
            -
              unsigned int original_index;
         | 
| 90 | 
            -
             | 
| 91 | 
            -
              // The (0-based) index where the new occurrence would be.
         | 
| 92 | 
            -
              unsigned int new_index;
         | 
| 93 | 
            -
            } GumboDuplicateAttrError;
         | 
| 94 | 
            -
             | 
| 95 | 
            -
            // A simplified representation of the tokenizer state, designed to be more
         | 
| 96 | 
            -
            // useful to clients of this library than the internal representation.  This
         | 
| 97 | 
            -
            // condenses the actual states used in the tokenizer state machine into a few
         | 
| 98 | 
            -
            // values that will be familiar to users of HTML.
         | 
| 99 | 
            -
            typedef enum {
         | 
| 100 | 
            -
              GUMBO_ERR_TOKENIZER_DATA,
         | 
| 101 | 
            -
              GUMBO_ERR_TOKENIZER_CHAR_REF,
         | 
| 102 | 
            -
              GUMBO_ERR_TOKENIZER_RCDATA,
         | 
| 103 | 
            -
              GUMBO_ERR_TOKENIZER_RAWTEXT,
         | 
| 104 | 
            -
              GUMBO_ERR_TOKENIZER_PLAINTEXT,
         | 
| 105 | 
            -
              GUMBO_ERR_TOKENIZER_SCRIPT,
         | 
| 106 | 
            -
              GUMBO_ERR_TOKENIZER_TAG,
         | 
| 107 | 
            -
              GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
         | 
| 108 | 
            -
              GUMBO_ERR_TOKENIZER_ATTR_NAME,
         | 
| 109 | 
            -
              GUMBO_ERR_TOKENIZER_ATTR_VALUE,
         | 
| 110 | 
            -
              GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
         | 
| 111 | 
            -
              GUMBO_ERR_TOKENIZER_COMMENT,
         | 
| 112 | 
            -
              GUMBO_ERR_TOKENIZER_DOCTYPE,
         | 
| 113 | 
            -
              GUMBO_ERR_TOKENIZER_CDATA,
         | 
| 114 | 
            -
            } GumboTokenizerErrorState;
         | 
| 115 | 
            -
             | 
| 116 79 | 
             
            // Additional data for tokenizer errors.
         | 
| 117 80 | 
             
            // This records the current state and codepoint encountered - this is usually
         | 
| 118 81 | 
             
            // enough to reconstruct what went wrong and provide a friendly error message.
         | 
| @@ -121,7 +84,7 @@ typedef struct GumboInternalTokenizerError { | |
| 121 84 | 
             
              int codepoint;
         | 
| 122 85 |  | 
| 123 86 | 
             
              // The state that the tokenizer was in at the time.
         | 
| 124 | 
            -
               | 
| 87 | 
            +
              GumboTokenizerEnum state;
         | 
| 125 88 | 
             
            } GumboTokenizerError;
         | 
| 126 89 |  | 
| 127 90 | 
             
            // Additional data for parse errors.
         | 
| @@ -129,61 +92,43 @@ typedef struct GumboInternalParserError { | |
| 129 92 | 
             
              // The type of input token that resulted in this error.
         | 
| 130 93 | 
             
              GumboTokenType input_type;
         | 
| 131 94 |  | 
| 132 | 
            -
              // The HTML tag of the input token. | 
| 95 | 
            +
              // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
         | 
| 133 96 | 
             
              GumboTag input_tag;
         | 
| 134 97 |  | 
| 135 98 | 
             
              // The insertion mode that the parser was in at the time.
         | 
| 136 99 | 
             
              GumboInsertionMode parser_state;
         | 
| 137 100 |  | 
| 138 | 
            -
              // The tag stack at the point of the error. | 
| 101 | 
            +
              // The tag stack at the point of the error. Note that this is a GumboVector
         | 
| 139 102 | 
             
              // of GumboTag's *stored by value* - cast the void* to a GumboTag directly to
         | 
| 140 103 | 
             
              // get at the tag.
         | 
| 141 104 | 
             
              GumboVector /* GumboTag */ tag_stack;
         | 
| 142 105 | 
             
            } GumboParserError;
         | 
| 143 106 |  | 
| 144 107 | 
             
            // The overall error struct representing an error in decoding/tokenizing/parsing
         | 
| 145 | 
            -
            // the HTML. | 
| 108 | 
            +
            // the HTML. This contains an enumerated type flag, a source position, and then
         | 
| 146 109 | 
             
            // a union of fields containing data specific to the error.
         | 
| 147 | 
            -
             | 
| 110 | 
            +
            struct GumboInternalError {
         | 
| 148 111 | 
             
              // The type of error.
         | 
| 149 112 | 
             
              GumboErrorType type;
         | 
| 150 113 |  | 
| 151 114 | 
             
              // The position within the source file where the error occurred.
         | 
| 152 115 | 
             
              GumboSourcePosition position;
         | 
| 153 116 |  | 
| 154 | 
            -
              //  | 
| 155 | 
            -
               | 
| 156 | 
            -
              // character-based instead of byte-based offsets).
         | 
| 157 | 
            -
              const char* original_text;
         | 
| 117 | 
            +
              // The piece of text that caused the error.
         | 
| 118 | 
            +
              GumboStringPiece original_text;
         | 
| 158 119 |  | 
| 159 120 | 
             
              // Type-specific error information.
         | 
| 160 121 | 
             
              union {
         | 
| 161 | 
            -
                // The code point we encountered, for:
         | 
| 162 | 
            -
                // * GUMBO_ERR_UTF8_INVALID
         | 
| 163 | 
            -
                // * GUMBO_ERR_UTF8_TRUNCATED
         | 
| 164 | 
            -
                // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
         | 
| 165 | 
            -
                // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
         | 
| 166 | 
            -
                uint64_t codepoint;
         | 
| 167 | 
            -
             | 
| 168 122 | 
             
                // Tokenizer errors.
         | 
| 169 123 | 
             
                GumboTokenizerError tokenizer;
         | 
| 170 124 |  | 
| 171 | 
            -
                //  | 
| 172 | 
            -
                 | 
| 173 | 
            -
                // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
         | 
| 174 | 
            -
                GumboStringPiece text;
         | 
| 175 | 
            -
             | 
| 176 | 
            -
                // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
         | 
| 177 | 
            -
                GumboDuplicateAttrError duplicate_attr;
         | 
| 178 | 
            -
             | 
| 179 | 
            -
                // Parser state, for GUMBO_ERR_PARSER and
         | 
| 180 | 
            -
                // GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
         | 
| 181 | 
            -
                struct GumboInternalParserError parser;
         | 
| 125 | 
            +
                // Parser errors.
         | 
| 126 | 
            +
                GumboParserError parser;
         | 
| 182 127 | 
             
              } v;
         | 
| 183 | 
            -
            } | 
| 128 | 
            +
            };
         | 
| 184 129 |  | 
| 185 130 | 
             
            // Adds a new error to the parser's error list, and returns a pointer to it so
         | 
| 186 | 
            -
            // that clients can fill out the rest of its fields. | 
| 131 | 
            +
            // that clients can fill out the rest of its fields. May return NULL if we're
         | 
| 187 132 | 
             
            // already over the max_errors field specified in GumboOptions.
         | 
| 188 133 | 
             
            GumboError* gumbo_add_error(struct GumboInternalParser* parser);
         | 
| 189 134 |  | 
| @@ -194,32 +139,10 @@ void gumbo_init_errors(struct GumboInternalParser* errors); | |
| 194 139 | 
             
            void gumbo_destroy_errors(struct GumboInternalParser* errors);
         | 
| 195 140 |  | 
| 196 141 | 
             
            // Frees the memory used for a single GumboError.
         | 
| 197 | 
            -
            void gumbo_error_destroy( | 
| 198 | 
            -
             | 
| 199 | 
            -
            // Prints an error to a string.  This fills an empty GumboStringBuffer with a
         | 
| 200 | 
            -
            // freshly-allocated buffer containing the error message text.  The caller is
         | 
| 201 | 
            -
            // responsible for deleting the buffer.  (Note that the buffer is allocated with
         | 
| 202 | 
            -
            // the allocator specified in the GumboParser config and hence should be freed
         | 
| 203 | 
            -
            // by gumbo_parser_deallocate().)
         | 
| 204 | 
            -
            void gumbo_error_to_string(struct GumboInternalParser* parser,
         | 
| 205 | 
            -
                const GumboError* error, GumboStringBuffer* output);
         | 
| 206 | 
            -
             | 
| 207 | 
            -
            // Prints a caret diagnostic to a string.  This fills an empty GumboStringBuffer
         | 
| 208 | 
            -
            // with a freshly-allocated buffer containing the error message text.  The
         | 
| 209 | 
            -
            // caller is responsible for deleting the buffer.  (Note that the buffer is
         | 
| 210 | 
            -
            // allocated with the allocator specified in the GumboParser config and hence
         | 
| 211 | 
            -
            // should be freed by gumbo_parser_deallocate().)
         | 
| 212 | 
            -
            void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser,
         | 
| 213 | 
            -
                const GumboError* error, const char* source_text,
         | 
| 214 | 
            -
                GumboStringBuffer* output);
         | 
| 215 | 
            -
             | 
| 216 | 
            -
            // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
         | 
| 217 | 
            -
            // of writing to a string.
         | 
| 218 | 
            -
            void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser,
         | 
| 219 | 
            -
                const GumboError* error, const char* source_text);
         | 
| 142 | 
            +
            void gumbo_error_destroy(GumboError* error);
         | 
| 220 143 |  | 
| 221 144 | 
             
            #ifdef __cplusplus
         | 
| 222 145 | 
             
            }
         | 
| 223 146 | 
             
            #endif
         | 
| 224 147 |  | 
| 225 | 
            -
            #endif | 
| 148 | 
            +
            #endif // GUMBO_ERROR_H_
         | 
| @@ -0,0 +1,104 @@ | |
| 1 | 
            +
            /* ANSI-C code produced by gperf version 3.1 */
         | 
| 2 | 
            +
            /* Command-line: gperf -m100 -n lib/foreign_attrs.gperf  */
         | 
| 3 | 
            +
            /* Computed positions: -k'2,8' */
         | 
| 4 | 
            +
            /* Filtered by: mk/gperf-filter.sed */
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            #include "replacement.h"
         | 
| 7 | 
            +
            #include "macros.h"
         | 
| 8 | 
            +
            #include <string.h>
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            #define TOTAL_KEYWORDS 11
         | 
| 11 | 
            +
            #define MIN_WORD_LENGTH 5
         | 
| 12 | 
            +
            #define MAX_WORD_LENGTH 13
         | 
| 13 | 
            +
            #define MIN_HASH_VALUE 0
         | 
| 14 | 
            +
            #define MAX_HASH_VALUE 10
         | 
| 15 | 
            +
            /* maximum key range = 11, duplicates = 0 */
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            /*
             * Perfect-hash function for the foreign-attribute keyword table
             * (gperf output; key positions 2 and 8, i.e. bytes str[1] and str[7]).
             *
             * str: candidate attribute name; at most bytes 1 and 7 are inspected.
             * len: number of valid bytes in str.
             *
             * Returns the sum of the association values of str[1] (when len >= 2)
             * and str[7] (when len >= 8). A byte that has no dedicated entry in
             * asso_values contributes 11, which is larger than the highest keyword
             * slot (10), so such inputs hash outside the keyword table.
             *
             * Unlike the plain switch form, whose `default:` label also matched
             * len 0 and 1 and therefore read str[7] and str[1] past the end of the
             * input, this version never reads beyond len bytes; inputs shorter
             * than 2 bytes hash to 0. Results for every len >= 2 are unchanged.
             *
             * NOTE(review): the file banner says this code is produced by gperf, so
             * regenerating the file will not carry these guards over.
             */
            static inline unsigned int
            hash (register const char *str, register size_t len)
            {
              static const unsigned char asso_values[] =
                {
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11,  2,
                  11, 10, 11,  9,  7,  6, 11, 11,  1,  0,
                  11,  5, 11, 11,  4, 11, 11, 11, 11, 11,
                  11,  3, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
                  11, 11, 11, 11, 11, 11
                };
              unsigned int hval = 0;

              /* Key position 8 (index 7) exists only in inputs of 8+ bytes. */
              if (len >= 8)
                {
                  hval += asso_values[(unsigned char)str[7]];
                }

              /* Key position 2 (index 1) exists only in inputs of 2+ bytes. */
              if (len >= 2)
                {
                  hval += asso_values[(unsigned char)str[1]];
                }

              return hval;
            }
| 67 | 
            +
             | 
| 68 | 
            +
            const ForeignAttrReplacement *
         | 
| 69 | 
            +
            gumbo_get_foreign_attr_replacement (register const char *str, register size_t len)
         | 
| 70 | 
            +
            {
         | 
| 71 | 
            +
              static const unsigned char lengthtable[] =
         | 
| 72 | 
            +
                {
         | 
| 73 | 
            +
                   5, 11,  9, 13, 10, 10, 10, 11, 10,  8,  8
         | 
| 74 | 
            +
                };
         | 
| 75 | 
            +
              static const ForeignAttrReplacement wordlist[] =
         | 
| 76 | 
            +
                {
         | 
| 77 | 
            +
                  {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
         | 
| 78 | 
            +
                  {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
         | 
| 79 | 
            +
                  {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
         | 
| 80 | 
            +
                  {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
         | 
| 81 | 
            +
                  {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
         | 
| 82 | 
            +
                  {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
         | 
| 83 | 
            +
                  {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
         | 
| 84 | 
            +
                  {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
         | 
| 85 | 
            +
                  {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
         | 
| 86 | 
            +
                  {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
         | 
| 87 | 
            +
                  {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML}
         | 
| 88 | 
            +
                };
         | 
| 89 | 
            +
             | 
| 90 | 
            +
              if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
         | 
| 91 | 
            +
                {
         | 
| 92 | 
            +
                  register unsigned int key = hash (str, len);
         | 
| 93 | 
            +
             | 
| 94 | 
            +
                  if (key <= MAX_HASH_VALUE)
         | 
| 95 | 
            +
                    if (len == lengthtable[key])
         | 
| 96 | 
            +
                      {
         | 
| 97 | 
            +
                        register const char *s = wordlist[key].from;
         | 
| 98 | 
            +
             | 
| 99 | 
            +
                        if (s && *str == *s && !memcmp (str + 1, s + 1, len - 1))
         | 
| 100 | 
            +
                          return &wordlist[key];
         | 
| 101 | 
            +
                      }
         | 
| 102 | 
            +
                }
         | 
| 103 | 
            +
              return 0;
         | 
| 104 | 
            +
            }
         | 
    
        data/gumbo-parser/src/gumbo.h
    CHANGED
    
    | @@ -1,51 +1,33 @@ | |
| 1 | 
            -
            // Copyright 2010 Google Inc. | 
| 2 | 
            -
            //
         | 
| 3 | 
            -
            // Licensed under the Apache License,  | 
| 4 | 
            -
             | 
| 5 | 
            -
            //  | 
| 6 | 
            -
            //
         | 
| 7 | 
            -
            // | 
| 8 | 
            -
            //
         | 
| 9 | 
            -
            // Unless required by applicable law or agreed to in writing, software
         | 
| 10 | 
            -
            // distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 11 | 
            -
            // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 12 | 
            -
            // See the License for the specific language governing permissions and
         | 
| 13 | 
            -
            // limitations under the License.
         | 
| 14 | 
            -
            //
         | 
| 15 | 
            -
            // Author: jdtang@google.com (Jonathan Tang)
         | 
| 16 | 
            -
            //
         | 
| 17 | 
            -
            // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
         | 
| 18 | 
            -
            // GUMBO_ as a prefix for enum constants (static constants get the Google-style
         | 
| 19 | 
            -
            // kGumbo prefix).
         | 
| 1 | 
            +
            // Copyright 2010 Google Inc.
         | 
| 2 | 
            +
            // Copyright 2018 Craig Barnes.
         | 
| 3 | 
            +
            // Licensed under the Apache License, version 2.0.
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions,
         | 
| 6 | 
            +
            // GUMBO_ as a prefix for enum constants and kGumbo as a prefix for
         | 
| 7 | 
            +
            // static constants
         | 
| 20 8 |  | 
| 21 9 | 
             
            /**
         | 
| 22 10 | 
             
             * @file
         | 
| 23 11 | 
             
             * @mainpage Gumbo HTML Parser
         | 
| 24 12 | 
             
             *
         | 
| 25 | 
            -
             * This provides a conformant, no-dependencies implementation of the | 
| 26 | 
            -
             * parsing algorithm. | 
| 27 | 
            -
             * encoding, run a preprocessing step to convert | 
| 28 | 
            -
             * tree made of the structs in this file.
         | 
| 13 | 
            +
             * This provides a conformant, no-dependencies implementation of the
         | 
| 14 | 
            +
             * [HTML5] parsing algorithm. It supports only UTF-8 -- if you need
         | 
| 15 | 
            +
             * to parse a different encoding, run a preprocessing step to convert
         | 
| 16 | 
            +
             * to UTF-8. It returns a parse tree made of the structs in this file.
         | 
| 29 17 | 
             
             *
         | 
| 30 18 | 
             
             * Example:
         | 
| 31 19 | 
             
             * @code
         | 
| 32 20 | 
             
             *    GumboOutput* output = gumbo_parse(input);
         | 
| 33 21 | 
             
             *    do_something_with_doctype(output->document);
         | 
| 34 22 | 
             
             *    do_something_with_html_tree(output->root);
         | 
| 35 | 
            -
             *    gumbo_destroy_output( | 
| 23 | 
            +
             *    gumbo_destroy_output(output);
         | 
| 36 24 | 
             
             * @endcode
         | 
| 37 | 
            -
             * HTML5 Spec:
         | 
| 38 25 | 
             
             *
         | 
| 39 | 
            -
             *  | 
| 26 | 
            +
             * [HTML5]: https://html.spec.whatwg.org/multipage/
         | 
| 40 27 | 
             
             */
         | 
| 41 28 |  | 
| 42 | 
            -
            #ifndef  | 
| 43 | 
            -
            #define  | 
| 44 | 
            -
             | 
| 45 | 
            -
            #ifdef _MSC_VER
         | 
| 46 | 
            -
            #define _CRT_SECURE_NO_WARNINGS
         | 
| 47 | 
            -
            #define fileno _fileno
         | 
| 48 | 
            -
            #endif
         | 
| 29 | 
            +
            #ifndef GUMBO_H
         | 
| 30 | 
            +
            #define GUMBO_H
         | 
| 49 31 |  | 
| 50 32 | 
             
            #include <stdbool.h>
         | 
| 51 33 | 
             
            #include <stddef.h>
         | 
| @@ -55,73 +37,77 @@ extern "C" { | |
| 55 37 | 
             
            #endif
         | 
| 56 38 |  | 
| 57 39 | 
             
            /**
         | 
| 58 | 
            -
             * A struct representing a character position within the original text | 
| 59 | 
            -
             * Line and column numbers are 1-based and offsets are 0-based, | 
| 60 | 
            -
             * how most editors and command-line tools work. | 
| 61 | 
            -
             * positions in terms of characters while offsets measure by bytes; this is
         | 
| 62 | 
            -
             * because the offset field is often used to pull out a particular region of
         | 
| 63 | 
            -
             * text (which in most languages that bind to C implies pointer arithmetic on a
         | 
| 64 | 
            -
             * buffer of bytes), while the column field is often used to reference a
         | 
| 65 | 
            -
             * particular column on a printable display, which nowadays is usually UTF-8.
         | 
| 40 | 
            +
             * A struct representing a character position within the original text
         | 
| 41 | 
            +
             * buffer. Line and column numbers are 1-based and offsets are 0-based,
         | 
| 42 | 
            +
             * which matches how most editors and command-line tools work.
         | 
| 66 43 | 
             
             */
         | 
| 67 44 | 
             
            typedef struct {
         | 
| 68 | 
            -
               | 
| 69 | 
            -
               | 
| 70 | 
            -
               | 
| 45 | 
            +
              size_t line;
         | 
| 46 | 
            +
              size_t column;
         | 
| 47 | 
            +
              size_t offset;
         | 
| 71 48 | 
             
            } GumboSourcePosition;
         | 
| 72 49 |  | 
| 73 50 | 
             
            /**
         | 
| 74 | 
            -
             * A  | 
| 75 | 
            -
             * parser | 
| 76 | 
            -
              | 
| 77 | 
            -
             | 
| 78 | 
            -
             | 
| 79 | 
            -
             | 
| 80 | 
            -
             *  | 
| 81 | 
            -
             * parser are represented by a char* and a length; the char* points into
         | 
| 82 | 
            -
             * an existing data buffer owned by some other code (often the original input).
         | 
| 83 | 
            -
             * GumboStringPieces are assumed (by convention) to be immutable, because they
         | 
| 84 | 
            -
             * may share data.  Use GumboStringBuffer if you need to construct a string.
         | 
| 85 | 
            -
             * Clients should assume that it is not NUL-terminated, and should always use
         | 
| 86 | 
            -
             * explicit lengths when manipulating them.
         | 
| 51 | 
            +
             * A struct representing a string or part of a string. Strings within
         | 
| 52 | 
            +
             * the parser are represented by a `char*` and a length; the `char*`
         | 
| 53 | 
            +
             * points into an existing data buffer owned by some other code (often
         | 
| 54 | 
            +
             * the original input). `GumboStringPiece`s are assumed (by convention)
         | 
| 55 | 
            +
             * to be immutable, because they may share data. Clients should assume
         | 
| 56 | 
            +
             * that it is not NUL-terminated and should always use explicit lengths
         | 
| 57 | 
            +
             * when manipulating them.
         | 
| 87 58 | 
             
             */
         | 
| 88 59 | 
             
            typedef struct {
         | 
| 89 | 
            -
              /** A pointer to the beginning of the string. | 
| 60 | 
            +
              /** A pointer to the beginning of the string. `NULL` if `length == 0`. */
         | 
| 90 61 | 
             
              const char* data;
         | 
| 91 62 |  | 
| 92 | 
            -
              /** The length of the string fragment, in bytes | 
| 63 | 
            +
              /** The length of the string fragment, in bytes (may be zero). */
         | 
| 93 64 | 
             
              size_t length;
         | 
| 94 65 | 
             
            } GumboStringPiece;
         | 
| 95 66 |  | 
| 67 | 
            +
            #define GUMBO_EMPTY_STRING_INIT { .data = NULL, .length = 0 }
         | 
| 96 68 | 
             
            /** A constant to represent a 0-length null string. */
         | 
| 97 | 
            -
             | 
| 69 | 
            +
            #define kGumboEmptyString (const GumboStringPiece)GUMBO_EMPTY_STRING_INIT
         | 
| 98 70 |  | 
| 99 71 | 
             
            /**
         | 
| 100 | 
            -
             * Compares two  | 
| 101 | 
            -
             * otherwise.
         | 
| 72 | 
            +
             * Compares two `GumboStringPiece`s, and returns `true` if they're
         | 
| 73 | 
            +
             * equal or `false` otherwise.
         | 
| 102 74 | 
             
             */
         | 
| 103 | 
            -
            bool gumbo_string_equals(
         | 
| 104 | 
            -
             | 
| 75 | 
            +
            bool gumbo_string_equals (
         | 
| 76 | 
            +
              const GumboStringPiece* str1,
         | 
| 77 | 
            +
              const GumboStringPiece* str2
         | 
| 78 | 
            +
            );
         | 
| 105 79 |  | 
| 106 80 | 
             
            /**
         | 
| 107 | 
            -
             * Compares two  | 
| 108 | 
            -
             * equal or false otherwise.
         | 
| 81 | 
            +
             * Compares two `GumboStringPiece`s, ignoring case, and returns `true`
         | 
| 82 | 
            +
             * if they're equal or `false` otherwise.
         | 
| 109 83 | 
             
             */
         | 
| 110 | 
            -
            bool gumbo_string_equals_ignore_case(
         | 
| 111 | 
            -
             | 
| 84 | 
            +
            bool gumbo_string_equals_ignore_case (
         | 
| 85 | 
            +
              const GumboStringPiece* str1,
         | 
| 86 | 
            +
              const GumboStringPiece* str2
         | 
| 87 | 
            +
            );
         | 
| 112 88 |  | 
| 113 89 | 
             
            /**
         | 
| 114 | 
            -
             *  | 
| 115 | 
            -
             *  | 
| 116 | 
            -
              | 
| 117 | 
            -
              | 
| 118 | 
            -
             *  | 
| 119 | 
            -
             *  | 
| 120 | 
            -
             | 
| 90 | 
            +
             * Check if the first `GumboStringPiece` is a prefix of the second, ignoring
         | 
| 91 | 
            +
             * case.
         | 
| 92 | 
            +
             */
         | 
| 93 | 
            +
            bool gumbo_string_prefix_ignore_case (
         | 
| 94 | 
            +
              const GumboStringPiece* prefix,
         | 
| 95 | 
            +
              const GumboStringPiece* str
         | 
| 96 | 
            +
            );
         | 
| 97 | 
            +
             | 
| 98 | 
            +
            /**
         | 
| 99 | 
            +
             * A simple vector implementation. This stores a pointer to a data array
         | 
| 100 | 
            +
             * and a length. All elements are stored as `void*`; client code must
         | 
| 101 | 
            +
             * cast to the appropriate type. Overflows upon addition result in
         | 
| 102 | 
            +
             * reallocation of the data array, with the size doubling to maintain
         | 
| 103 | 
            +
             * `O(1)` amortized cost. There is no removal function, as this isn't
         | 
| 104 | 
            +
             * needed for any of the operations within this library. Iteration can
         | 
| 105 | 
            +
             * be done through inspecting the structure directly in a `for` loop.
         | 
| 121 106 | 
             
             */
         | 
| 122 107 | 
             
            typedef struct {
         | 
| 123 | 
            -
              /** | 
| 124 | 
            -
               * elements | 
| 108 | 
            +
              /**
         | 
| 109 | 
            +
               * Data elements. This points to a dynamically-allocated array of
         | 
| 110 | 
            +
               * `capacity` elements, each a `void*` to the element itself.
         | 
| 125 111 | 
             
               */
         | 
| 126 112 | 
             
              void** data;
         | 
| 127 113 |  | 
| @@ -132,82 +118,230 @@ typedef struct { | |
| 132 118 | 
             
              unsigned int capacity;
         | 
| 133 119 | 
             
            } GumboVector;
         | 
| 134 120 |  | 
| 135 | 
            -
             | 
| 136 | 
            -
             | 
| 121 | 
            +
            # define GUMBO_EMPTY_VECTOR_INIT { .data = NULL, .length = 0, .capacity = 0 }
         | 
| 122 | 
            +
            /** An empty (0-length, 0-capacity) `GumboVector`. */
         | 
| 123 | 
            +
            #define kGumboEmptyVector (const GumboVector)GUMBO_EMPTY_VECTOR_INIT
         | 
| 137 124 |  | 
| 138 125 | 
             
            /**
         | 
| 139 | 
            -
             * Returns the first index at which an element appears in this vector | 
| 140 | 
            -
             * by pointer equality), or  | 
| 126 | 
            +
             * Returns the first index at which an element appears in this vector
         | 
| 127 | 
            +
             * (testing by pointer equality), or `-1` if it never does.
         | 
| 141 128 | 
             
             */
         | 
| 142 129 | 
             
            int gumbo_vector_index_of(GumboVector* vector, const void* element);
         | 
| 143 130 |  | 
| 144 131 | 
             
            /**
         | 
| 145 | 
            -
             * An enum for all the tags defined in the HTML5 standard. | 
| 146 | 
            -
             * the tag names themselves. | 
| 147 | 
            -
             * the spec itself (or for tags with special | 
| 148 | 
            -
             *  | 
| 149 | 
            -
             * name can be obtained | 
| 132 | 
            +
             * An `enum` for all the tags defined in the HTML5 standard. These
         | 
| 133 | 
            +
             * correspond to the tag names themselves. Enum constants exist only
         | 
| 134 | 
            +
             * for tags that appear in the spec itself (or for tags with special
         | 
| 135 | 
            +
             * handling in the SVG and MathML namespaces). Any other tags appear
         | 
| 136 | 
            +
             * as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained
         | 
| 137 | 
            +
             * through `original_tag`.
         | 
| 150 138 | 
             
             *
         | 
| 151 | 
            -
             * This is mostly for API convenience, so that clients of this library | 
| 152 | 
            -
             * need to perform a strcasecmp to find the normalized tag | 
| 153 | 
            -
             * efficiency benefits, by letting the parser work | 
| 154 | 
            -
             * strings.
         | 
| 139 | 
            +
             * This is mostly for API convenience, so that clients of this library
         | 
| 140 | 
            +
             * don't need to perform a `strcasecmp` to find the normalized tag
         | 
| 141 | 
            +
             * name. It also has efficiency benefits, by letting the parser work
         | 
| 142 | 
            +
             * with enums instead of strings.
         | 
| 155 143 | 
             
             */
         | 
| 156 144 | 
             
            typedef enum {
         | 
| 157 | 
            -
             | 
| 158 | 
            -
             | 
| 159 | 
            -
               | 
| 160 | 
            -
               | 
| 145 | 
            +
              GUMBO_TAG_HTML,
         | 
| 146 | 
            +
              GUMBO_TAG_HEAD,
         | 
| 147 | 
            +
              GUMBO_TAG_TITLE,
         | 
| 148 | 
            +
              GUMBO_TAG_BASE,
         | 
| 149 | 
            +
              GUMBO_TAG_LINK,
         | 
| 150 | 
            +
              GUMBO_TAG_META,
         | 
| 151 | 
            +
              GUMBO_TAG_STYLE,
         | 
| 152 | 
            +
              GUMBO_TAG_SCRIPT,
         | 
| 153 | 
            +
              GUMBO_TAG_NOSCRIPT,
         | 
| 154 | 
            +
              GUMBO_TAG_TEMPLATE,
         | 
| 155 | 
            +
              GUMBO_TAG_BODY,
         | 
| 156 | 
            +
              GUMBO_TAG_ARTICLE,
         | 
| 157 | 
            +
              GUMBO_TAG_SECTION,
         | 
| 158 | 
            +
              GUMBO_TAG_NAV,
         | 
| 159 | 
            +
              GUMBO_TAG_ASIDE,
         | 
| 160 | 
            +
              GUMBO_TAG_H1,
         | 
| 161 | 
            +
              GUMBO_TAG_H2,
         | 
| 162 | 
            +
              GUMBO_TAG_H3,
         | 
| 163 | 
            +
              GUMBO_TAG_H4,
         | 
| 164 | 
            +
              GUMBO_TAG_H5,
         | 
| 165 | 
            +
              GUMBO_TAG_H6,
         | 
| 166 | 
            +
              GUMBO_TAG_HGROUP,
         | 
| 167 | 
            +
              GUMBO_TAG_HEADER,
         | 
| 168 | 
            +
              GUMBO_TAG_FOOTER,
         | 
| 169 | 
            +
              GUMBO_TAG_ADDRESS,
         | 
| 170 | 
            +
              GUMBO_TAG_P,
         | 
| 171 | 
            +
              GUMBO_TAG_HR,
         | 
| 172 | 
            +
              GUMBO_TAG_PRE,
         | 
| 173 | 
            +
              GUMBO_TAG_BLOCKQUOTE,
         | 
| 174 | 
            +
              GUMBO_TAG_OL,
         | 
| 175 | 
            +
              GUMBO_TAG_UL,
         | 
| 176 | 
            +
              GUMBO_TAG_LI,
         | 
| 177 | 
            +
              GUMBO_TAG_DL,
         | 
| 178 | 
            +
              GUMBO_TAG_DT,
         | 
| 179 | 
            +
              GUMBO_TAG_DD,
         | 
| 180 | 
            +
              GUMBO_TAG_FIGURE,
         | 
| 181 | 
            +
              GUMBO_TAG_FIGCAPTION,
         | 
| 182 | 
            +
              GUMBO_TAG_MAIN,
         | 
| 183 | 
            +
              GUMBO_TAG_DIV,
         | 
| 184 | 
            +
              GUMBO_TAG_A,
         | 
| 185 | 
            +
              GUMBO_TAG_EM,
         | 
| 186 | 
            +
              GUMBO_TAG_STRONG,
         | 
| 187 | 
            +
              GUMBO_TAG_SMALL,
         | 
| 188 | 
            +
              GUMBO_TAG_S,
         | 
| 189 | 
            +
              GUMBO_TAG_CITE,
         | 
| 190 | 
            +
              GUMBO_TAG_Q,
         | 
| 191 | 
            +
              GUMBO_TAG_DFN,
         | 
| 192 | 
            +
              GUMBO_TAG_ABBR,
         | 
| 193 | 
            +
              GUMBO_TAG_DATA,
         | 
| 194 | 
            +
              GUMBO_TAG_TIME,
         | 
| 195 | 
            +
              GUMBO_TAG_CODE,
         | 
| 196 | 
            +
              GUMBO_TAG_VAR,
         | 
| 197 | 
            +
              GUMBO_TAG_SAMP,
         | 
| 198 | 
            +
              GUMBO_TAG_KBD,
         | 
| 199 | 
            +
              GUMBO_TAG_SUB,
         | 
| 200 | 
            +
              GUMBO_TAG_SUP,
         | 
| 201 | 
            +
              GUMBO_TAG_I,
         | 
| 202 | 
            +
              GUMBO_TAG_B,
         | 
| 203 | 
            +
              GUMBO_TAG_U,
         | 
| 204 | 
            +
              GUMBO_TAG_MARK,
         | 
| 205 | 
            +
              GUMBO_TAG_RUBY,
         | 
| 206 | 
            +
              GUMBO_TAG_RT,
         | 
| 207 | 
            +
              GUMBO_TAG_RP,
         | 
| 208 | 
            +
              GUMBO_TAG_BDI,
         | 
| 209 | 
            +
              GUMBO_TAG_BDO,
         | 
| 210 | 
            +
              GUMBO_TAG_SPAN,
         | 
| 211 | 
            +
              GUMBO_TAG_BR,
         | 
| 212 | 
            +
              GUMBO_TAG_WBR,
         | 
| 213 | 
            +
              GUMBO_TAG_INS,
         | 
| 214 | 
            +
              GUMBO_TAG_DEL,
         | 
| 215 | 
            +
              GUMBO_TAG_IMAGE,
         | 
| 216 | 
            +
              GUMBO_TAG_IMG,
         | 
| 217 | 
            +
              GUMBO_TAG_IFRAME,
         | 
| 218 | 
            +
              GUMBO_TAG_EMBED,
         | 
| 219 | 
            +
              GUMBO_TAG_OBJECT,
         | 
| 220 | 
            +
              GUMBO_TAG_PARAM,
         | 
| 221 | 
            +
              GUMBO_TAG_VIDEO,
         | 
| 222 | 
            +
              GUMBO_TAG_AUDIO,
         | 
| 223 | 
            +
              GUMBO_TAG_SOURCE,
         | 
| 224 | 
            +
              GUMBO_TAG_TRACK,
         | 
| 225 | 
            +
              GUMBO_TAG_CANVAS,
         | 
| 226 | 
            +
              GUMBO_TAG_MAP,
         | 
| 227 | 
            +
              GUMBO_TAG_AREA,
         | 
| 228 | 
            +
              GUMBO_TAG_MATH,
         | 
| 229 | 
            +
              GUMBO_TAG_MI,
         | 
| 230 | 
            +
              GUMBO_TAG_MO,
         | 
| 231 | 
            +
              GUMBO_TAG_MN,
         | 
| 232 | 
            +
              GUMBO_TAG_MS,
         | 
| 233 | 
            +
              GUMBO_TAG_MTEXT,
         | 
| 234 | 
            +
              GUMBO_TAG_MGLYPH,
         | 
| 235 | 
            +
              GUMBO_TAG_MALIGNMARK,
         | 
| 236 | 
            +
              GUMBO_TAG_ANNOTATION_XML,
         | 
| 237 | 
            +
              GUMBO_TAG_SVG,
         | 
| 238 | 
            +
              GUMBO_TAG_FOREIGNOBJECT,
         | 
| 239 | 
            +
              GUMBO_TAG_DESC,
         | 
| 240 | 
            +
              GUMBO_TAG_TABLE,
         | 
| 241 | 
            +
              GUMBO_TAG_CAPTION,
         | 
| 242 | 
            +
              GUMBO_TAG_COLGROUP,
         | 
| 243 | 
            +
              GUMBO_TAG_COL,
         | 
| 244 | 
            +
              GUMBO_TAG_TBODY,
         | 
| 245 | 
            +
              GUMBO_TAG_THEAD,
         | 
| 246 | 
            +
              GUMBO_TAG_TFOOT,
         | 
| 247 | 
            +
              GUMBO_TAG_TR,
         | 
| 248 | 
            +
              GUMBO_TAG_TD,
         | 
| 249 | 
            +
              GUMBO_TAG_TH,
         | 
| 250 | 
            +
              GUMBO_TAG_FORM,
         | 
| 251 | 
            +
              GUMBO_TAG_FIELDSET,
         | 
| 252 | 
            +
              GUMBO_TAG_LEGEND,
         | 
| 253 | 
            +
              GUMBO_TAG_LABEL,
         | 
| 254 | 
            +
              GUMBO_TAG_INPUT,
         | 
| 255 | 
            +
              GUMBO_TAG_BUTTON,
         | 
| 256 | 
            +
              GUMBO_TAG_SELECT,
         | 
| 257 | 
            +
              GUMBO_TAG_DATALIST,
         | 
| 258 | 
            +
              GUMBO_TAG_OPTGROUP,
         | 
| 259 | 
            +
              GUMBO_TAG_OPTION,
         | 
| 260 | 
            +
              GUMBO_TAG_TEXTAREA,
         | 
| 261 | 
            +
              GUMBO_TAG_KEYGEN,
         | 
| 262 | 
            +
              GUMBO_TAG_OUTPUT,
         | 
| 263 | 
            +
              GUMBO_TAG_PROGRESS,
         | 
| 264 | 
            +
              GUMBO_TAG_METER,
         | 
| 265 | 
            +
              GUMBO_TAG_DETAILS,
         | 
| 266 | 
            +
              GUMBO_TAG_SUMMARY,
         | 
| 267 | 
            +
              GUMBO_TAG_MENU,
         | 
| 268 | 
            +
              GUMBO_TAG_MENUITEM,
         | 
| 269 | 
            +
              GUMBO_TAG_APPLET,
         | 
| 270 | 
            +
              GUMBO_TAG_ACRONYM,
         | 
| 271 | 
            +
              GUMBO_TAG_BGSOUND,
         | 
| 272 | 
            +
              GUMBO_TAG_DIR,
         | 
| 273 | 
            +
              GUMBO_TAG_FRAME,
         | 
| 274 | 
            +
              GUMBO_TAG_FRAMESET,
         | 
| 275 | 
            +
              GUMBO_TAG_NOFRAMES,
         | 
| 276 | 
            +
              GUMBO_TAG_LISTING,
         | 
| 277 | 
            +
              GUMBO_TAG_XMP,
         | 
| 278 | 
            +
              GUMBO_TAG_NEXTID,
         | 
| 279 | 
            +
              GUMBO_TAG_NOEMBED,
         | 
| 280 | 
            +
              GUMBO_TAG_PLAINTEXT,
         | 
| 281 | 
            +
              GUMBO_TAG_RB,
         | 
| 282 | 
            +
              GUMBO_TAG_STRIKE,
         | 
| 283 | 
            +
              GUMBO_TAG_BASEFONT,
         | 
| 284 | 
            +
              GUMBO_TAG_BIG,
         | 
| 285 | 
            +
              GUMBO_TAG_BLINK,
         | 
| 286 | 
            +
              GUMBO_TAG_CENTER,
         | 
| 287 | 
            +
              GUMBO_TAG_FONT,
         | 
| 288 | 
            +
              GUMBO_TAG_MARQUEE,
         | 
| 289 | 
            +
              GUMBO_TAG_MULTICOL,
         | 
| 290 | 
            +
              GUMBO_TAG_NOBR,
         | 
| 291 | 
            +
              GUMBO_TAG_SPACER,
         | 
| 292 | 
            +
              GUMBO_TAG_TT,
         | 
| 293 | 
            +
              GUMBO_TAG_RTC,
         | 
| 294 | 
            +
              GUMBO_TAG_DIALOG,
         | 
| 295 | 
            +
              // Used for all tags that don't have special handling in HTML.
         | 
| 161 296 | 
             
              GUMBO_TAG_UNKNOWN,
         | 
| 162 297 | 
             
              // A marker value to indicate the end of the enum, for iterating over it.
         | 
| 163 | 
            -
              // Also used as the terminator for varargs functions that take tags.
         | 
| 164 298 | 
             
              GUMBO_TAG_LAST,
         | 
| 165 299 | 
             
            } GumboTag;
         | 
| 166 300 |  | 
| 167 301 | 
             
            /**
         | 
| 168 | 
            -
             * Returns the normalized ( | 
| 169 | 
            -
             *  | 
| 170 | 
            -
             * library.
         | 
| 302 | 
            +
             * Returns the normalized (all lower case) tag name for a `GumboTag` enum. The
         | 
| 303 | 
            +
             * return value is static data owned by the library.
         | 
| 171 304 | 
             
             */
         | 
| 172 305 | 
             
            const char* gumbo_normalized_tagname(GumboTag tag);
         | 
| 173 306 |  | 
| 174 307 | 
             
            /**
         | 
| 175 | 
            -
             * Extracts the tag name from the original_text field of an element | 
| 176 | 
            -
             * stripping off  | 
| 177 | 
            -
             * GumboStringPiece appropriately. | 
| 178 | 
            -
             * shares a buffer with the original | 
| 179 | 
            -
             * Behavior is undefined if a | 
| 180 | 
            -
             *  | 
| 181 | 
            -
             *  | 
| 182 | 
            -
             *  | 
| 308 | 
            +
             * Extracts the tag name from the `original_text` field of an element
         | 
| 309 | 
            +
             * or token by stripping off `</>` characters and attributes and
         | 
| 310 | 
            +
             * adjusting the passed-in `GumboStringPiece` appropriately. The tag
         | 
| 311 | 
            +
             * name is in the original case and shares a buffer with the original
         | 
| 312 | 
            +
             * text, to simplify memory management. Behavior is undefined if a
         | 
| 313 | 
            +
             * string piece that doesn't represent an HTML tag (`<tagname>` or
         | 
| 314 | 
            +
             * `</tagname>`) is passed in. If the string piece is completely
         | 
| 315 | 
            +
             * empty (`NULL` data pointer), then this function will exit
         | 
| 316 | 
            +
             * successfully as a no-op.
         | 
| 183 317 | 
             
             */
         | 
| 184 318 | 
             
            void gumbo_tag_from_original_text(GumboStringPiece* text);
         | 
| 185 319 |  | 
| 186 320 | 
             
            /**
         | 
| 187 | 
            -
             * Fixes the case of SVG elements that are not all lowercase.
         | 
| 188 | 
            -
             *  | 
| 189 | 
            -
             *  | 
| 190 | 
            -
             *  | 
| 191 | 
            -
             *  | 
| 192 | 
            -
             *  | 
| 193 | 
            -
             *  | 
| 194 | 
            -
             *  | 
| 195 | 
            -
             *  | 
| 196 | 
            -
             * | 
| 321 | 
            +
             * Fixes the case of SVG elements that are not all lowercase. This is
         | 
| 322 | 
            +
             * not done at parse time because there's no place to store a mutated
         | 
| 323 | 
            +
             * tag name. `tag_name` is an enum (which will be `TAG_UNKNOWN` for most
         | 
| 324 | 
            +
             * SVG tags without special handling), while `original_tag_name` is a
         | 
| 325 | 
            +
             * pointer into the original buffer. Instead, we provide this helper
         | 
| 326 | 
            +
             * function that clients can use to rename SVG tags as appropriate.
         | 
| 327 | 
            +
             * Returns the case-normalized SVG tagname if a replacement is found, or
         | 
| 328 | 
            +
             * `NULL` if no normalization is called for. The return value is static
         | 
| 329 | 
            +
             * data and owned by the library.
         | 
| 330 | 
            +
             *
         | 
| 331 | 
            +
             * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
         | 
| 197 332 | 
             
             */
         | 
| 198 333 | 
             
            const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
         | 
| 199 334 |  | 
| 200 335 | 
             
            /**
         | 
| 201 | 
            -
             * Converts a tag name string (which may be in upper or mixed case) to a | 
| 202 | 
            -
             * enum. | 
| 336 | 
            +
             * Converts a tag name string (which may be in upper or mixed case) to a
         | 
| 337 | 
            +
             * tag enum.
         | 
| 203 338 | 
             
             */
         | 
| 204 | 
            -
            GumboTag  | 
| 205 | 
            -
            GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
         | 
| 339 | 
            +
            GumboTag gumbo_tagn_enum(const char* tagname, size_t length);
         | 
| 206 340 |  | 
| 207 341 | 
             
            /**
         | 
| 208 342 | 
             
             * Attribute namespaces.
         | 
| 209 | 
            -
             * HTML includes special handling for XLink, XML, and XMLNS namespaces | 
| 210 | 
            -
             * attributes. | 
| 343 | 
            +
             * HTML includes special handling for XLink, XML, and XMLNS namespaces
         | 
| 344 | 
            +
             * on attributes. Everything else goes in the generic "NONE" namespace.
         | 
| 211 345 | 
             
             */
         | 
| 212 346 | 
             
            typedef enum {
         | 
| 213 347 | 
             
              GUMBO_ATTR_NAMESPACE_NONE,
         | 
| @@ -217,46 +351,47 @@ typedef enum { | |
| 217 351 | 
             
            } GumboAttributeNamespaceEnum;
         | 
| 218 352 |  | 
| 219 353 | 
             
            /**
         | 
| 220 | 
            -
             * A struct representing a single attribute on  | 
| 221 | 
            -
             * name-value pair, but also includes information about source locations | 
| 222 | 
            -
             * original source text.
         | 
| 354 | 
            +
             * A struct representing a single attribute on an HTML tag. This is a
         | 
| 355 | 
            +
             * name-value pair, but also includes information about source locations
         | 
| 356 | 
            +
             * and original source text.
         | 
| 223 357 | 
             
             */
         | 
| 224 358 | 
             
            typedef struct {
         | 
| 225 359 | 
             
              /**
         | 
| 226 | 
            -
               * The namespace for the attribute. | 
| 227 | 
            -
               * GUMBO_ATTR_NAMESPACE_NONE | 
| 228 | 
            -
               * values, per:
         | 
| 229 | 
            -
               *  | 
| 360 | 
            +
               * The namespace for the attribute. This will usually be
         | 
| 361 | 
            +
               * `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes
         | 
| 362 | 
            +
               * take special values, per:
         | 
| 363 | 
            +
               * https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
         | 
| 230 364 | 
             
               */
         | 
| 231 365 | 
             
              GumboAttributeNamespaceEnum attr_namespace;
         | 
| 232 366 |  | 
| 233 367 | 
             
              /**
         | 
| 234 | 
            -
               * The name of the attribute. | 
| 235 | 
            -
               * with case-normalization | 
| 368 | 
            +
               * The name of the attribute. This is in a freshly-allocated buffer to
         | 
| 369 | 
            +
               * deal with case-normalization and is null-terminated.
         | 
| 236 370 | 
             
               */
         | 
| 237 371 | 
             
              const char* name;
         | 
| 238 372 |  | 
| 239 373 | 
             
              /**
         | 
| 240 | 
            -
               * The original text of the attribute name, as a pointer into the | 
| 241 | 
            -
               * source buffer.
         | 
| 374 | 
            +
               * The original text of the attribute name, as a pointer into the
         | 
| 375 | 
            +
               * original source buffer.
         | 
| 242 376 | 
             
               */
         | 
| 243 377 | 
             
              GumboStringPiece original_name;
         | 
| 244 378 |  | 
| 245 379 | 
             
              /**
         | 
| 246 | 
            -
               * The value of the attribute. | 
| 247 | 
            -
               * with unescaping | 
| 248 | 
            -
               * that surround the attribute. | 
| 249 | 
            -
               *  | 
| 380 | 
            +
               * The value of the attribute. This is in a freshly-allocated buffer
         | 
| 381 | 
            +
               * to deal with unescaping and is null-terminated. It does not include
         | 
| 382 | 
            +
               * any quotes that surround the attribute. If the attribute has no
         | 
| 383 | 
            +
               * value (for example, `selected` on a checkbox) this will be an empty
         | 
| 384 | 
            +
               * string.
         | 
| 250 385 | 
             
               */
         | 
| 251 386 | 
             
              const char* value;
         | 
| 252 387 |  | 
| 253 388 | 
             
              /**
         | 
| 254 | 
            -
               * The original text of the value of the attribute. | 
| 255 | 
            -
               * original source buffer. | 
| 256 | 
            -
               * attribute | 
| 257 | 
            -
               * original_value.data[original_value.length - 1] to determine what | 
| 258 | 
            -
               * characters were. | 
| 259 | 
            -
               * string.
         | 
| 389 | 
            +
               * The original text of the value of the attribute. This points into
         | 
| 390 | 
            +
               * the original source buffer. It includes any quotes that surround
         | 
| 391 | 
            +
               * the attribute and you can look at `original_value.data[0]` and
         | 
| 392 | 
            +
               * `original_value.data[original_value.length - 1]` to determine what
         | 
| 393 | 
            +
               * the quote characters were. If the attribute has no value this will
         | 
| 394 | 
            +
               * be a 0-length string.
         | 
| 260 395 | 
             
               */
         | 
| 261 396 | 
             
              GumboStringPiece original_value;
         | 
| 262 397 |  | 
| @@ -264,9 +399,9 @@ typedef struct { | |
| 264 399 | 
             
              GumboSourcePosition name_start;
         | 
| 265 400 |  | 
| 266 401 | 
             
              /**
         | 
| 267 | 
            -
               * The ending position of the attribute name. | 
| 402 | 
            +
               * The ending position of the attribute name. This is not always derivable
         | 
| 268 403 | 
             
               * from the starting position of the value because of the possibility of
         | 
| 269 | 
            -
               * whitespace around the  | 
| 404 | 
            +
               * whitespace around the `=` sign.
         | 
| 270 405 | 
             
               */
         | 
| 271 406 | 
             
              GumboSourcePosition name_end;
         | 
| 272 407 |  | 
| @@ -278,34 +413,37 @@ typedef struct { | |
| 278 413 | 
             
            } GumboAttribute;
         | 
| 279 414 |  | 
| 280 415 | 
             
            /**
         | 
| 281 | 
            -
             * Given a vector of  | 
| 282 | 
            -
             * and return it, or NULL if no such attribute exists. | 
| 283 | 
            -
             * case-insensitive match, as HTML is case-insensitive.
         | 
| 416 | 
            +
             * Given a vector of `GumboAttribute`s, look up the one with the
         | 
| 417 | 
            +
             * specified name and return it, or `NULL` if no such attribute exists.
         | 
| 418 | 
            +
             * This uses a case-insensitive match, as HTML is case-insensitive.
         | 
| 284 419 | 
             
             */
         | 
| 285 420 | 
             
            GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
         | 
| 286 421 |  | 
| 287 422 | 
             
            /**
         | 
| 288 | 
            -
             * Enum denoting the type of node. | 
| 289 | 
            -
             * union.
         | 
| 423 | 
            +
             * Enum denoting the type of node. This determines the type of the
         | 
| 424 | 
            +
             * `node.v` union.
         | 
| 290 425 | 
             
             */
         | 
| 291 426 | 
             
            typedef enum {
         | 
| 292 | 
            -
              /** Document node. | 
| 427 | 
            +
              /** Document node. `v` will be a `GumboDocument`. */
         | 
| 293 428 | 
             
              GUMBO_NODE_DOCUMENT,
         | 
| 294 | 
            -
              /** Element node. | 
| 429 | 
            +
              /** Element node. `v` will be a `GumboElement`. */
         | 
| 295 430 | 
             
              GUMBO_NODE_ELEMENT,
         | 
| 296 | 
            -
              /** Text node. | 
| 431 | 
            +
              /** Text node. `v` will be a `GumboText`. */
         | 
| 297 432 | 
             
              GUMBO_NODE_TEXT,
         | 
| 298 | 
            -
              /** CDATA node. v will be a GumboText | 
| 433 | 
            +
              /** CDATA node. `v` will be a `GumboText`. */
         | 
| 299 434 | 
             
              GUMBO_NODE_CDATA,
         | 
| 300 | 
            -
              /** Comment node. | 
| 435 | 
            +
              /** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */
         | 
| 301 436 | 
             
              GUMBO_NODE_COMMENT,
         | 
| 302 | 
            -
              /** Text node, where all contents is whitespace. | 
| 437 | 
            +
              /** Text node, where all content is whitespace. `v` will be a `GumboText`. */
         | 
| 303 438 | 
             
              GUMBO_NODE_WHITESPACE,
         | 
| 304 | 
            -
              /** | 
| 305 | 
            -
               *  | 
| 306 | 
            -
               *  | 
| 307 | 
            -
               *  | 
| 308 | 
            -
               *  | 
| 439 | 
            +
              /**
         | 
| 440 | 
            +
               * Template node. This is separate from `GUMBO_NODE_ELEMENT` because
         | 
| 441 | 
            +
               * many client libraries will want to ignore the contents of template
         | 
| 442 | 
            +
               * nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will
         | 
| 443 | 
            +
               * do the right thing here, while clients that want to include template
         | 
| 444 | 
            +
               * contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a
         | 
| 445 | 
            +
               * `GumboElement`.
         | 
| 446 | 
            +
               */
         | 
| 309 447 | 
             
              GUMBO_NODE_TEMPLATE
         | 
| 310 448 | 
             
            } GumboNodeType;
         | 
| 311 449 |  | 
| @@ -315,9 +453,7 @@ typedef enum { | |
| 315 453 | 
             
             */
         | 
| 316 454 | 
             
            typedef struct GumboInternalNode GumboNode;
         | 
| 317 455 |  | 
| 318 | 
            -
            /**
         | 
| 319 | 
            -
             * http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode
         | 
| 320 | 
            -
             */
         | 
| 456 | 
            +
            /** https://dom.spec.whatwg.org/#concept-document-quirks */
         | 
| 321 457 | 
             
            typedef enum {
         | 
| 322 458 | 
             
              GUMBO_DOCTYPE_NO_QUIRKS,
         | 
| 323 459 | 
             
              GUMBO_DOCTYPE_QUIRKS,
         | 
| @@ -326,10 +462,11 @@ typedef enum { | |
| 326 462 |  | 
| 327 463 | 
             
            /**
         | 
| 328 464 | 
             
             * Namespaces.
         | 
| 329 | 
            -
             * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. | 
| 330 | 
            -
             * anything inside an  | 
| 331 | 
            -
             *  | 
| 332 | 
            -
             *  | 
| 465 | 
            +
             * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
         | 
| 466 | 
            +
             * Rather, anything inside an `<svg>` tag is in the SVG namespace,
         | 
| 467 | 
            +
             * anything inside the `<math>` tag is in the MathML namespace, and
         | 
| 468 | 
            +
             * anything else is inside the HTML namespace. No other namespaces are
         | 
| 469 | 
            +
             * supported, so this can be an `enum`.
         | 
| 333 470 | 
             
             */
         | 
| 334 471 | 
             
            typedef enum {
         | 
| 335 472 | 
             
              GUMBO_NAMESPACE_HTML,
         | 
| @@ -339,66 +476,70 @@ typedef enum { | |
| 339 476 |  | 
| 340 477 | 
             
            /**
         | 
| 341 478 | 
             
             * Parse flags.
         | 
| 342 | 
            -
             * We track the reasons for parser insertion of nodes and store them in | 
| 343 | 
            -
             * bitvector in the node itself. | 
| 344 | 
            -
             * are implied by the HTML structure of the document, or flag | 
| 345 | 
            -
             * may not be allowed by a style guide, or track the | 
| 346 | 
            -
             * tricky HTML code.
         | 
| 479 | 
            +
             * We track the reasons for parser insertion of nodes and store them in
         | 
| 480 | 
            +
             * a bitvector in the node itself. This lets client code optimize out
         | 
| 481 | 
            +
             * nodes that are implied by the HTML structure of the document, or flag
         | 
| 482 | 
            +
             * constructs that may not be allowed by a style guide, or track the
         | 
| 483 | 
            +
             * prevalence of incorrect or tricky HTML code.
         | 
| 347 484 | 
             
             */
         | 
| 348 485 | 
             
            typedef enum {
         | 
| 349 486 | 
             
              /**
         | 
| 350 | 
            -
               * A normal node  | 
| 351 | 
            -
               * been reparented.
         | 
| 487 | 
            +
               * A normal node -- both start and end tags appear in the source,
         | 
| 488 | 
            +
               * nothing has been reparented.
         | 
| 352 489 | 
             
               */
         | 
| 353 490 | 
             
              GUMBO_INSERTION_NORMAL = 0,
         | 
| 354 491 |  | 
| 355 492 | 
             
              /**
         | 
| 356 | 
            -
               * A node inserted by the parser to fulfill some implicit insertion | 
| 357 | 
            -
               * This is usually set in addition to some other flag giving a | 
| 358 | 
            -
               * insertion reason; it's a generic catch-all term | 
| 359 | 
            -
               * this node did not appear in the document | 
| 493 | 
            +
               * A node inserted by the parser to fulfill some implicit insertion
         | 
| 494 | 
            +
               * rule. This is usually set in addition to some other flag giving a
         | 
| 495 | 
            +
               * more specific insertion reason; it's a generic catch-all term
         | 
| 496 | 
            +
               * meaning "The start tag for this node did not appear in the document
         | 
| 497 | 
            +
               * source".
         | 
| 360 498 | 
             
               */
         | 
| 361 499 | 
             
              GUMBO_INSERTION_BY_PARSER = 1 << 0,
         | 
| 362 500 |  | 
| 363 501 | 
             
              /**
         | 
| 364 | 
            -
               * A flag indicating that the end tag for this node did not appear in | 
| 365 | 
            -
               * document source. | 
| 366 | 
            -
               * parser-inserted nodes with an explicit end tag | 
| 367 | 
            -
               * has GUMBO_INSERTED_BY_PARSER set on the  | 
| 368 | 
            -
               * GUMBO_INSERTED_END_TAG_IMPLICITLY is unset, as the | 
| 369 | 
            -
               *  | 
| 370 | 
            -
               * | 
| 371 | 
            -
               *  | 
| 372 | 
            -
               *  | 
| 502 | 
            +
               * A flag indicating that the end tag for this node did not appear in
         | 
| 503 | 
            +
               * the document source. Note that in some cases, you can still have
         | 
| 504 | 
            +
               * parser-inserted nodes with an explicit end tag. For example,
         | 
| 505 | 
            +
               * `Text</html>` has `GUMBO_INSERTION_BY_PARSER` set on the `<html>`
         | 
| 506 | 
            +
               * node, but `GUMBO_INSERTION_IMPLICIT_END_TAG` is unset, as the
         | 
| 507 | 
            +
               * `</html>` tag actually exists.
         | 
| 508 | 
            +
               *
         | 
| 509 | 
            +
               * This flag will be set only if the end tag is completely missing.
         | 
| 510 | 
            +
               * In some cases, the end tag may be misplaced (e.g. a `</body>` tag
         | 
| 511 | 
            +
               * with text afterwards), which will leave this flag unset and require
         | 
| 512 | 
            +
               * clients to inspect the parse errors for that case.
         | 
| 373 513 | 
             
               */
         | 
| 374 514 | 
             
              GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
         | 
| 375 515 |  | 
| 376 516 | 
             
              // Value 1 << 2 was for a flag that has since been removed.
         | 
| 377 517 |  | 
| 378 518 | 
             
              /**
         | 
| 379 | 
            -
               * A flag for nodes that are inserted because their presence is | 
| 380 | 
            -
               * other tags,  | 
| 519 | 
            +
               * A flag for nodes that are inserted because their presence is
         | 
| 520 | 
            +
               * implied by other tags, e.g. `<html>`, `<head>`, `<body>`,
         | 
| 521 | 
            +
               * `<tbody>`, etc.
         | 
| 381 522 | 
             
               */
         | 
| 382 523 | 
             
              GUMBO_INSERTION_IMPLIED = 1 << 3,
         | 
| 383 524 |  | 
| 384 525 | 
             
              /**
         | 
| 385 | 
            -
               * A flag for nodes that are converted from their end tag equivalents. | 
| 386 | 
            -
               * example,  | 
| 387 | 
            -
               * create a  | 
| 388 | 
            -
               * as  | 
| 526 | 
            +
               * A flag for nodes that are converted from their end tag equivalents.
         | 
| 527 | 
            +
               * For example, `</p>` when no paragraph is open implies that the
         | 
| 528 | 
            +
               * parser should create a `<p>` tag and immediately close it, while
         | 
| 529 | 
            +
               * `</br>` means the same thing as `<br>`.
         | 
| 389 530 | 
             
               */
         | 
| 390 531 | 
             
              GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
         | 
| 391 532 |  | 
| 392 | 
            -
               | 
| 393 | 
            -
              GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
         | 
| 533 | 
            +
              // Value 1 << 5 was for a flag that has since been removed.
         | 
| 394 534 |  | 
| 395 | 
            -
              /** A flag for  | 
| 535 | 
            +
              /** A flag for `<image>` tags that are rewritten as `<img>`. */
         | 
| 396 536 | 
             
              GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
         | 
| 397 537 |  | 
| 398 538 | 
             
              /**
         | 
| 399 | 
            -
               * A flag for nodes that are cloned as a result of the reconstruction | 
| 400 | 
            -
               * active formatting elements. | 
| 401 | 
            -
               * portion of the formatting run is a NORMAL node with an | 
| 539 | 
            +
               * A flag for nodes that are cloned as a result of the reconstruction
         | 
| 540 | 
            +
               * of active formatting elements. This is set only on the clone; the
         | 
| 541 | 
            +
               * initial portion of the formatting run is a NORMAL node with an
         | 
| 542 | 
            +
               * `IMPLICIT_END_TAG`.
         | 
| 402 543 | 
             
               */
         | 
| 403 544 | 
             
              GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
         | 
| 404 545 |  | 
| @@ -415,18 +556,19 @@ typedef enum { | |
| 415 556 | 
             
              GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
         | 
| 416 557 | 
             
            } GumboParseFlags;
         | 
| 417 558 |  | 
| 418 | 
            -
            /**
         | 
| 419 | 
            -
             * Information specific to document nodes.
         | 
| 420 | 
            -
             */
         | 
| 559 | 
            +
            /** Information specific to document nodes. */
         | 
| 421 560 | 
             
            typedef struct {
         | 
| 422 561 | 
             
              /**
         | 
| 423 | 
            -
               * An array of  | 
| 424 | 
            -
               * normally consist of the  | 
| 425 | 
            -
               * Pointers are owned.
         | 
| 562 | 
            +
               * An array of `GumboNode`s, containing the children of this document.
         | 
| 563 | 
            +
               * This will normally consist of the `<html>` element and any comment
         | 
| 564 | 
            +
               * nodes found. Pointers are owned.
         | 
| 426 565 | 
             
               */
         | 
| 427 566 | 
             
              GumboVector /* GumboNode* */ children;
         | 
| 428 567 |  | 
| 429 | 
            -
               | 
| 568 | 
            +
              /**
         | 
| 569 | 
            +
               * `true` if there was an explicit doctype token, as opposed to it
         | 
| 570 | 
            +
               * being omitted.
         | 
| 571 | 
            +
               */
         | 
| 430 572 | 
             
              bool has_doctype;
         | 
| 431 573 |  | 
| 432 574 | 
             
              // Fields from the doctype token, copied verbatim.
         | 
| @@ -435,65 +577,70 @@ typedef struct { | |
| 435 577 | 
             
              const char* system_identifier;
         | 
| 436 578 |  | 
| 437 579 | 
             
              /**
         | 
| 438 | 
            -
               * Whether or not the document is in QuirksMode, as determined by the | 
| 439 | 
            -
               * in the GumboTokenDocType template.
         | 
| 580 | 
            +
               * Whether or not the document is in QuirksMode, as determined by the
         | 
| 581 | 
            +
               * values in the GumboTokenDocType template.
         | 
| 440 582 | 
             
               */
         | 
| 441 583 | 
             
              GumboQuirksModeEnum doc_type_quirks_mode;
         | 
| 442 584 | 
             
            } GumboDocument;
         | 
| 443 585 |  | 
| 444 586 | 
             
            /**
         | 
| 445 | 
            -
             * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE | 
| 446 | 
            -
             * This contains just a block of text and its position.
         | 
| 587 | 
            +
             * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
         | 
| 588 | 
            +
             * nodes. This contains just a block of text and its position.
         | 
| 447 589 | 
             
             */
         | 
| 448 590 | 
             
            typedef struct {
         | 
| 449 591 | 
             
              /**
         | 
| 450 | 
            -
               * The text of this node, after entities have been parsed and decoded. | 
| 451 | 
            -
               * comment | 
| 592 | 
            +
               * The text of this node, after entities have been parsed and decoded.
         | 
| 593 | 
            +
               * For comment and cdata nodes, this does not include the comment
         | 
| 594 | 
            +
               * delimiters.
         | 
| 452 595 | 
             
               */
         | 
| 453 596 | 
             
              const char* text;
         | 
| 454 597 |  | 
| 455 598 | 
             
              /**
         | 
| 456 | 
            -
               * The original text of this node, as a pointer into the original | 
| 457 | 
            -
               * comment/cdata nodes, this includes the comment | 
| 599 | 
            +
               * The original text of this node, as a pointer into the original
         | 
| 600 | 
            +
               * buffer. For comment/cdata nodes, this includes the comment
         | 
| 601 | 
            +
               * delimiters.
         | 
| 458 602 | 
             
               */
         | 
| 459 603 | 
             
              GumboStringPiece original_text;
         | 
| 460 604 |  | 
| 461 605 | 
             
              /**
         | 
| 462 | 
            -
               * The starting position of this node. | 
| 463 | 
            -
               * original_text | 
| 606 | 
            +
               * The starting position of this node. This corresponds to the
         | 
| 607 | 
            +
               * position of `original_text`, before entities are decoded.
         | 
| 464 608 | 
             
               * */
         | 
| 465 609 | 
             
              GumboSourcePosition start_pos;
         | 
| 466 610 | 
             
            } GumboText;
         | 
| 467 611 |  | 
| 468 612 | 
             
            /**
         | 
| 469 | 
            -
             * The struct used to represent all HTML elements. | 
| 470 | 
            -
             * about the tag, attributes, and child nodes.
         | 
| 613 | 
            +
             * The struct used to represent all HTML elements. This contains
         | 
| 614 | 
            +
             * information about the tag, attributes, and child nodes.
         | 
| 471 615 | 
             
             */
         | 
| 472 616 | 
             
            typedef struct {
         | 
| 473 617 | 
             
              /**
         | 
| 474 | 
            -
               * An array of  | 
| 475 | 
            -
               * are owned.
         | 
| 618 | 
            +
               * An array of `GumboNode`s, containing the children of this element.
         | 
| 619 | 
            +
               * Pointers are owned.
         | 
| 476 620 | 
             
               */
         | 
| 477 621 | 
             
              GumboVector /* GumboNode* */ children;
         | 
| 478 622 |  | 
| 479 623 | 
             
              /** The GumboTag enum for this element. */
         | 
| 480 624 | 
             
              GumboTag tag;
         | 
| 481 625 |  | 
| 626 | 
            +
              /** The name for this element. */
         | 
| 627 | 
            +
              const char* name;
         | 
| 628 | 
            +
             | 
| 482 629 | 
             
              /** The GumboNamespaceEnum for this element. */
         | 
| 483 630 | 
             
              GumboNamespaceEnum tag_namespace;
         | 
| 484 631 |  | 
| 485 632 | 
             
              /**
         | 
| 486 | 
            -
               * A GumboStringPiece pointing to the original tag text for this | 
| 487 | 
            -
               * pointing directly into the source buffer. | 
| 488 | 
            -
               * algorithmically (for example,  | 
| 489 | 
            -
               * zero-length string.
         | 
| 633 | 
            +
               * A `GumboStringPiece` pointing to the original tag text for this
         | 
| 634 | 
            +
               * element, pointing directly into the source buffer. If the tag was
         | 
| 635 | 
            +
               * inserted algorithmically (for example, `<head>` or `<tbody>`
         | 
| 636 | 
            +
               * insertion), this will be a zero-length string.
         | 
| 490 637 | 
             
               */
         | 
| 491 638 | 
             
              GumboStringPiece original_tag;
         | 
| 492 639 |  | 
| 493 640 | 
             
              /**
         | 
| 494 | 
            -
               * A GumboStringPiece pointing to the original end tag text for this | 
| 495 | 
            -
               * If the end tag was inserted algorithmically, (for example, | 
| 496 | 
            -
               * self-closing tag), this will be a zero-length string.
         | 
| 641 | 
            +
               * A `GumboStringPiece` pointing to the original end tag text for this
         | 
| 642 | 
            +
               * element. If the end tag was inserted algorithmically, (for example,
         | 
| 643 | 
            +
               * closing a self-closing tag), this will be a zero-length string.
         | 
| 497 644 | 
             
               */
         | 
| 498 645 | 
             
              GumboStringPiece original_end_tag;
         | 
| 499 646 |  | 
| @@ -504,30 +651,31 @@ typedef struct { | |
| 504 651 | 
             
              GumboSourcePosition end_pos;
         | 
| 505 652 |  | 
| 506 653 | 
             
              /**
         | 
| 507 | 
            -
               * An array of  | 
| 508 | 
            -
               * order that they were parsed. | 
| 654 | 
            +
               * An array of `GumboAttribute`s, containing the attributes for this
         | 
| 655 | 
            +
               * tag in the order that they were parsed. Pointers are owned.
         | 
| 509 656 | 
             
               */
         | 
| 510 657 | 
             
              GumboVector /* GumboAttribute* */ attributes;
         | 
| 511 658 | 
             
            } GumboElement;
         | 
| 512 659 |  | 
| 513 660 | 
             
            /**
         | 
| 514 | 
            -
             * A supertype for GumboElement and GumboText | 
| 515 | 
            -
             * generic type in lists of children and cast as necessary | 
| 661 | 
            +
             * A supertype for `GumboElement` and `GumboText`, so that we can
         | 
| 662 | 
            +
             * include one generic type in lists of children and cast as necessary
         | 
| 663 | 
            +
             * to subtypes.
         | 
| 516 664 | 
             
             */
         | 
| 517 665 | 
             
            struct GumboInternalNode {
         | 
| 518 666 | 
             
              /** The type of node that this is. */
         | 
| 519 667 | 
             
              GumboNodeType type;
         | 
| 520 668 |  | 
| 521 | 
            -
              /** Pointer back to parent node. | 
| 669 | 
            +
              /** Pointer back to parent node. Not owned. */
         | 
| 522 670 | 
             
              GumboNode* parent;
         | 
| 523 671 |  | 
| 524 672 | 
             
              /** The index within the parent's children vector of this node. */
         | 
| 525 | 
            -
               | 
| 673 | 
            +
              unsigned int index_within_parent;
         | 
| 526 674 |  | 
| 527 675 | 
             
              /**
         | 
| 528 | 
            -
               * A bitvector of flags containing information about why this element | 
| 529 | 
            -
               * inserted into the parse tree, including a variety of special | 
| 530 | 
            -
               * situations.
         | 
| 676 | 
            +
               * A bitvector of flags containing information about why this element
         | 
| 677 | 
            +
               * was inserted into the parse tree, including a variety of special
         | 
| 678 | 
            +
               * parse situations.
         | 
| 531 679 | 
             
               */
         | 
| 532 680 | 
             
              GumboParseFlags parse_flags;
         | 
| 533 681 |  | 
| @@ -539,133 +687,257 @@ struct GumboInternalNode { | |
| 539 687 | 
             
              } v;
         | 
| 540 688 | 
             
            };
         | 
| 541 689 |  | 
| 542 | 
            -
            /**
         | 
| 543 | 
            -
             * The type for an allocator function.  Takes the 'userdata' member of the
         | 
| 544 | 
            -
             * GumboParser struct as its first argument.  Semantics should be the same as
         | 
| 545 | 
            -
             * malloc, i.e. return a block of size_t bytes on success or NULL on failure.
         | 
| 546 | 
            -
             * Allocating a block of 0 bytes behaves as per malloc.
         | 
| 547 | 
            -
             */
         | 
| 548 | 
            -
            // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
         | 
| 549 | 
            -
            typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
         | 
| 550 | 
            -
             | 
| 551 | 
            -
            /**
         | 
| 552 | 
            -
             * The type for a deallocator function.  Takes the 'userdata' member of the
         | 
| 553 | 
            -
             * GumboParser struct as its first argument.
         | 
| 554 | 
            -
             */
         | 
| 555 | 
            -
            typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
         | 
| 556 | 
            -
             | 
| 557 690 | 
             
            /**
         | 
| 558 691 | 
             
             * Input struct containing configuration options for the parser.
         | 
| 559 | 
            -
             * These let you specify alternate memory managers, provide different | 
| 560 | 
            -
             * handling, etc.
         | 
| 561 | 
            -
             *  | 
| 692 | 
            +
             * These let you specify alternate memory managers, provide different
         | 
| 693 | 
            +
             * error handling, etc. Use `kGumboDefaultOptions` for sensible
         | 
| 694 | 
            +
             * defaults and only set what you need.
         | 
| 562 695 | 
             
             */
         | 
| 563 696 | 
             
            typedef struct GumboInternalOptions {
         | 
| 564 | 
            -
              /** | 
| 565 | 
            -
             | 
| 566 | 
            -
             | 
| 567 | 
            -
             | 
| 568 | 
            -
               | 
| 697 | 
            +
              /**
         | 
| 698 | 
            +
               * The tab-stop size, for computing positions in HTML files that
         | 
| 699 | 
            +
               * use tabs. Default: `8`.
         | 
| 700 | 
            +
               */
         | 
| 701 | 
            +
              int tab_stop;
         | 
| 569 702 |  | 
| 570 703 | 
             
              /**
         | 
| 571 | 
            -
               *  | 
| 572 | 
            -
               *  | 
| 704 | 
            +
               * Whether or not to stop parsing when the first error is encountered.
         | 
| 705 | 
            +
               * Default: `false`.
         | 
| 573 706 | 
             
               */
         | 
| 574 | 
            -
               | 
| 707 | 
            +
              bool stop_on_first_error;
         | 
| 575 708 |  | 
| 576 709 | 
             
              /**
         | 
| 577 | 
            -
               *  | 
| 578 | 
            -
               *  | 
| 710 | 
            +
               * Maximum allowed number of attributes per element. If this limit is
         | 
| 711 | 
            +
               * exceeded, the parser will return early with a partial document and
         | 
| 712 | 
            +
               * the returned `GumboOutput` will have its `status` field set to
         | 
| 713 | 
            +
               * `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
         | 
| 714 | 
            +
               * Default: `400`.
         | 
| 579 715 | 
             
               */
         | 
| 580 | 
            -
              int  | 
| 716 | 
            +
              int max_attributes;
         | 
| 581 717 |  | 
| 582 718 | 
             
              /**
         | 
| 583 | 
            -
               *  | 
| 584 | 
            -
               *  | 
| 719 | 
            +
               * Maximum allowed depth for the parse tree. If this limit is exceeded,
         | 
| 720 | 
            +
               * the parser will return early with a partial document and the returned
         | 
| 721 | 
            +
               * `GumboOutput` will have its `status` field set to
         | 
| 722 | 
            +
               * `GUMBO_STATUS_TREE_TOO_DEEP`.
         | 
| 723 | 
            +
               * Default: `400`.
         | 
| 585 724 | 
             
               */
         | 
| 586 | 
            -
               | 
| 725 | 
            +
              unsigned int max_tree_depth;
         | 
| 587 726 |  | 
| 588 727 | 
             
              /**
         | 
| 589 | 
            -
               * The maximum number of errors before the parser stops recording | 
| 590 | 
            -
               * is provided so that if the page is totally borked, we | 
| 591 | 
            -
               * up the errors vector and exhaust memory with | 
| 592 | 
            -
               * to  | 
| 593 | 
            -
               * Default:  | 
| 728 | 
            +
               * The maximum number of errors before the parser stops recording
         | 
| 729 | 
            +
               * them. This is provided so that if the page is totally borked, we
         | 
| 730 | 
            +
               * don't completely fill up the errors vector and exhaust memory with
         | 
| 731 | 
            +
               * useless redundant errors. Set to `-1` to disable the limit.
         | 
| 732 | 
            +
               * Default: `-1`.
         | 
| 594 733 | 
             
               */
         | 
| 595 734 | 
             
              int max_errors;
         | 
| 596 735 |  | 
| 597 736 | 
             
              /**
         | 
| 598 737 | 
             
               * The fragment context for parsing:
         | 
| 599 | 
            -
               * https://html.spec.whatwg.org/multipage/ | 
| 738 | 
            +
               * https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
         | 
| 600 739 | 
             
               *
         | 
| 601 | 
            -
               * If  | 
| 602 | 
            -
               * the regular parsing algorithm. | 
| 603 | 
            -
               * intended parent of the parsed fragment. | 
| 604 | 
            -
               *  | 
| 605 | 
            -
               *  | 
| 606 | 
            -
               * if parsing a fragment even when a full HTML tree isn't available.
         | 
| 740 | 
            +
               * If `NULL` is passed here, it is assumed to be "no
         | 
| 741 | 
            +
               * fragment", i.e. the regular parsing algorithm. Otherwise, pass the
         | 
| 742 | 
            +
               * tag name for the intended parent of the parsed fragment. We use the
         | 
| 743 | 
            +
               * tag name, namespace, and encoding attribute which are sufficient to
         | 
| 744 | 
            +
               * set all of the parsing context needed for fragment parsing.
         | 
| 607 745 | 
             
               *
         | 
| 608 | 
            -
               * Default:  | 
| 746 | 
            +
               * Default: `NULL`.
         | 
| 609 747 | 
             
               */
         | 
| 610 | 
            -
               | 
| 748 | 
            +
              const char* fragment_context;
         | 
| 611 749 |  | 
| 612 750 | 
             
              /**
         | 
| 613 | 
            -
               * The namespace for the fragment context. | 
| 614 | 
            -
               * differentiate between, say, parsing a  | 
| 615 | 
            -
               * HTML.
         | 
| 616 | 
            -
               * | 
| 751 | 
            +
               * The namespace for the fragment context. This lets client code
         | 
| 752 | 
            +
               * differentiate between, say, parsing a `<title>` tag in SVG vs.
         | 
| 753 | 
            +
               * parsing it in HTML.
         | 
| 754 | 
            +
               *
         | 
| 755 | 
            +
               * Default: `GUMBO_NAMESPACE_HTML`.
         | 
| 617 756 | 
             
               */
         | 
| 618 757 | 
             
              GumboNamespaceEnum fragment_namespace;
         | 
| 758 | 
            +
             | 
| 759 | 
            +
              /**
         | 
| 760 | 
            +
               * The value of the fragment context's `encoding` attribute, if any.
         | 
| 761 | 
            +
               * Set to `NULL` for no `encoding` attribute.
         | 
| 762 | 
            +
               *
         | 
| 763 | 
            +
               * Default: `NULL`.
         | 
| 764 | 
            +
               */
         | 
| 765 | 
            +
              const char* fragment_encoding;
         | 
| 766 | 
            +
             | 
| 767 | 
            +
              /**
         | 
| 768 | 
            +
               * Quirks mode for fragment parsing. The quirks mode for a given DOCTYPE can
         | 
| 769 | 
            +
               * be looked up using `gumbo_compute_quirks_mode()`.
         | 
| 770 | 
            +
               *
         | 
| 771 | 
            +
               * Default: `GUMBO_DOCTYPE_NO_QUIRKS`.
         | 
| 772 | 
            +
               */
         | 
| 773 | 
            +
              GumboQuirksModeEnum quirks_mode;
         | 
| 774 | 
            +
             | 
| 775 | 
            +
              /**
         | 
| 776 | 
            +
               * For fragment parsing. Set this to true if the context node has a form
         | 
| 777 | 
            +
               * element as an ancestor.
         | 
| 778 | 
            +
               *
         | 
| 779 | 
            +
               * Default: `false`.
         | 
| 780 | 
            +
               */
         | 
| 781 | 
            +
              bool fragment_context_has_form_ancestor;
         | 
| 619 782 | 
             
            } GumboOptions;
         | 
| 620 783 |  | 
| 621 784 | 
             
            /** Default options struct; use this with gumbo_parse_with_options. */
         | 
| 622 785 | 
             
            extern const GumboOptions kGumboDefaultOptions;
         | 
| 623 786 |  | 
| 787 | 
            +
            /**
         | 
| 788 | 
            +
             * Status code indicating whether parsing finished successfully or
         | 
| 789 | 
            +
             * was stopped mid-document due to exceptional circumstances.
         | 
| 790 | 
            +
             */
         | 
| 791 | 
            +
            typedef enum {
         | 
| 792 | 
            +
              /**
         | 
| 793 | 
            +
               * Indicates that parsing completed successfully. The resulting tree
         | 
| 794 | 
            +
               * will be a complete document.
         | 
| 795 | 
            +
               */
         | 
| 796 | 
            +
              GUMBO_STATUS_OK,
         | 
| 797 | 
            +
             | 
| 798 | 
            +
              /**
         | 
| 799 | 
            +
               * Indicates that the maximum element nesting limit
         | 
| 800 | 
            +
               * (`GumboOptions::max_tree_depth`) was reached during parsing. The
         | 
| 801 | 
            +
               * resulting tree will be a partial document, with no further nodes
         | 
| 802 | 
            +
               * created after the point where the limit was reached. The partial
         | 
| 803 | 
            +
               * document may be useful for constructing an error message but
         | 
| 804 | 
            +
               * typically shouldn't be used for other purposes.
         | 
| 805 | 
            +
               */
         | 
| 806 | 
            +
              GUMBO_STATUS_TREE_TOO_DEEP,
         | 
| 807 | 
            +
             | 
| 808 | 
            +
              /**
         | 
| 809 | 
            +
               * Indicates that the maximum number of attributes per element
         | 
| 810 | 
            +
               * (`GumboOptions::max_attributes`) was reached during parsing. The
         | 
| 811 | 
            +
               * resulting tree will be a partial document, with no further nodes
         | 
| 812 | 
            +
               * created after the point where the limit was reached. The partial
         | 
| 813 | 
            +
               * document may be useful for constructing an error message but
         | 
| 814 | 
            +
               * typically shouldn't be used for other purposes.
         | 
| 815 | 
            +
               */
         | 
| 816 | 
            +
              GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
         | 
| 817 | 
            +
             | 
| 818 | 
            +
              // Currently unused
         | 
| 819 | 
            +
              GUMBO_STATUS_OUT_OF_MEMORY,
         | 
| 820 | 
            +
            } GumboOutputStatus;
         | 
| 821 | 
            +
             | 
| 822 | 
            +
             | 
| 624 823 | 
             
            /** The output struct containing the results of the parse. */
         | 
| 625 824 | 
             
            typedef struct GumboInternalOutput {
         | 
| 626 825 | 
             
              /**
         | 
| 627 | 
            -
               * Pointer to the document node. | 
| 628 | 
            -
               * that contains the entire document as its child.
         | 
| 826 | 
            +
               * Pointer to the document node. This is a `GumboNode` of type
         | 
| 827 | 
            +
               * `NODE_DOCUMENT` that contains the entire document as its child.
         | 
| 629 828 | 
             
               */
         | 
| 630 829 | 
             
              GumboNode* document;
         | 
| 631 830 |  | 
| 632 831 | 
             
              /**
         | 
| 633 | 
            -
               * Pointer to the root node. | 
| 634 | 
            -
               * document.
         | 
| 832 | 
            +
               * Pointer to the root node. This is the `<html>` tag that forms the
         | 
| 833 | 
            +
               * root of the document.
         | 
| 635 834 | 
             
               */
         | 
| 636 835 | 
             
              GumboNode* root;
         | 
| 637 836 |  | 
| 638 837 | 
             
              /**
         | 
| 639 838 | 
             
               * A list of errors that occurred during the parse.
         | 
| 640 | 
            -
               * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
         | 
| 641 | 
            -
               * fleshed out and may change in the future.  For this reason, the GumboError
         | 
| 642 | 
            -
               * header isn't part of the public API.  Contact us if you need errors
         | 
| 643 | 
            -
               * reported so we can work out something appropriate for your use-case.
         | 
| 644 839 | 
             
               */
         | 
| 645 840 | 
             
              GumboVector /* GumboError */ errors;
         | 
| 841 | 
            +
             | 
| 842 | 
            +
              /**
         | 
| 843 | 
            +
               * True if the parser encountered an error.
         | 
| 844 | 
            +
               *
         | 
| 845 | 
            +
               * This can be true and `errors` an empty `GumboVector` if the `max_errors`
         | 
| 846 | 
            +
               * option was set to 0.
         | 
| 847 | 
            +
               */
         | 
| 848 | 
            +
              bool document_error;
         | 
| 849 | 
            +
             | 
| 850 | 
            +
              /**
         | 
| 851 | 
            +
               * A status code indicating whether parsing finished successfully or was
         | 
| 852 | 
            +
               * stopped mid-document due to exceptional circumstances.
         | 
| 853 | 
            +
               */
         | 
| 854 | 
            +
              GumboOutputStatus status;
         | 
| 646 855 | 
             
            } GumboOutput;
         | 
| 647 856 |  | 
| 648 857 | 
             
            /**
         | 
| 649 | 
            -
             * Parses a buffer of  | 
| 650 | 
            -
             * live at least as long as the parse tree, as some fields | 
| 651 | 
            -
             * point directly into the original buffer.
         | 
| 858 | 
            +
             * Parses a buffer of UTF-8 text into a `GumboNode` parse tree. The
         | 
| 859 | 
            +
             * buffer must live at least as long as the parse tree, as some fields
         | 
| 860 | 
            +
             * (e.g. `original_text`) point directly into the original buffer.
         | 
| 652 861 | 
             
             *
         | 
| 653 862 | 
             
             * This doesn't support buffers longer than 4 gigabytes.
         | 
| 654 863 | 
             
             */
         | 
| 655 864 | 
             
            GumboOutput* gumbo_parse(const char* buffer);
         | 
| 656 865 |  | 
| 657 866 | 
             
            /**
         | 
| 658 | 
            -
             * Extended version of gumbo_parse that takes an explicit options | 
| 659 | 
            -
             * buffer, and length.
         | 
| 867 | 
            +
             * Extended version of `gumbo_parse` that takes an explicit options
         | 
| 868 | 
            +
             * structure, buffer, and length.
         | 
| 869 | 
            +
             */
         | 
| 870 | 
            +
            GumboOutput* gumbo_parse_with_options (
         | 
| 871 | 
            +
              const GumboOptions* options,
         | 
| 872 | 
            +
              const char* buffer,
         | 
| 873 | 
            +
              size_t buffer_length
         | 
| 874 | 
            +
            );
         | 
| 875 | 
            +
             | 
| 876 | 
            +
            /**
         | 
| 877 | 
            +
             * Compute the quirks mode based on the name, public identifier, and system
         | 
| 878 | 
            +
             * identifier. Any of these may be `NULL` to indicate a missing value.
         | 
| 879 | 
            +
             */
         | 
| 880 | 
            +
            GumboQuirksModeEnum gumbo_compute_quirks_mode (
         | 
| 881 | 
            +
              const char *name,
         | 
| 882 | 
            +
              const char *pubid,
         | 
| 883 | 
            +
              const char *sysid
         | 
| 884 | 
            +
            );
         | 
| 885 | 
            +
             | 
| 886 | 
            +
            /** Convert a `GumboOutputStatus` code into a readable description. */
         | 
| 887 | 
            +
            const char* gumbo_status_to_string(GumboOutputStatus status);
         | 
| 888 | 
            +
             | 
| 889 | 
            +
            /** Release the memory used for the parse tree and parse errors. */
         | 
| 890 | 
            +
            void gumbo_destroy_output(GumboOutput* output);
         | 
| 891 | 
            +
             | 
| 892 | 
            +
            /** Opaque GumboError type */
         | 
| 893 | 
            +
            typedef struct GumboInternalError GumboError;
         | 
| 894 | 
            +
             | 
| 895 | 
            +
            /**
         | 
| 896 | 
            +
             * Returns the position of the error.
         | 
| 660 897 | 
             
             */
         | 
| 661 | 
            -
             | 
| 662 | 
            -
                const GumboOptions* options, const char* buffer, size_t buffer_length);
         | 
| 898 | 
            +
            GumboSourcePosition gumbo_error_position(const GumboError* error);
         | 
| 663 899 |  | 
| 664 | 
            -
            /** | 
| 665 | 
            -
             | 
| 900 | 
            +
            /**
         | 
| 901 | 
            +
             * Returns a constant string representation of the error's code. This is owned
         | 
| 902 | 
            +
             * by the library and should not be freed by the caller.
         | 
| 903 | 
            +
             */
         | 
| 904 | 
            +
            const char* gumbo_error_code(const GumboError* error);
         | 
| 905 | 
            +
             | 
| 906 | 
            +
            /**
         | 
| 907 | 
            +
             * Prints an error to a string. This stores a freshly-allocated buffer
         | 
| 908 | 
            +
             * containing the error message text in output. The caller is responsible for
         | 
| 909 | 
            +
             * freeing the buffer. The size of the error message is returned. The error
         | 
| 910 | 
            +
             * message itself may not be NUL-terminated and may contain NUL bytes so the
         | 
| 911 | 
            +
             * returned size must be used.
         | 
| 912 | 
            +
             */
         | 
| 913 | 
            +
            size_t gumbo_error_to_string(const GumboError* error, char **output);
         | 
| 914 | 
            +
             | 
| 915 | 
            +
            /**
         | 
| 916 | 
            +
             * Prints a caret diagnostic to a string. This stores a freshly-allocated
         | 
| 917 | 
            +
             * buffer containing the error message text in output. The caller is responsible for
         | 
| 918 | 
            +
             * freeing the buffer. The size of the error message is returned. The error
         | 
| 919 | 
            +
             * message itself may not be NUL-terminated and may contain NUL bytes so the
         | 
| 920 | 
            +
             * returned size must be used.
         | 
| 921 | 
            +
             */
         | 
| 922 | 
            +
            size_t gumbo_caret_diagnostic_to_string (
         | 
| 923 | 
            +
              const GumboError* error,
         | 
| 924 | 
            +
              const char* source_text,
         | 
| 925 | 
            +
              size_t source_length,
         | 
| 926 | 
            +
              char** output
         | 
| 927 | 
            +
            );
         | 
| 928 | 
            +
             | 
| 929 | 
            +
            /**
         | 
| 930 | 
            +
             * Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
         | 
| 931 | 
            +
             * instead of writing to a string.
         | 
| 932 | 
            +
             */
         | 
| 933 | 
            +
            void gumbo_print_caret_diagnostic (
         | 
| 934 | 
            +
              const GumboError* error,
         | 
| 935 | 
            +
              const char* source_text,
         | 
| 936 | 
            +
              size_t source_length
         | 
| 937 | 
            +
            );
         | 
| 666 938 |  | 
| 667 939 | 
             
            #ifdef __cplusplus
         | 
| 668 940 | 
             
            }
         | 
| 669 941 | 
             
            #endif
         | 
| 670 942 |  | 
| 671 | 
            -
            #endif | 
| 943 | 
            +
            #endif // GUMBO_H
         |