nokogumbo 1.5.0 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/README.md +237 -26
  3. data/ext/nokogumbo/extconf.rb +121 -0
  4. data/ext/nokogumbo/nokogumbo.c +793 -0
  5. data/gumbo-parser/src/ascii.c +75 -0
  6. data/gumbo-parser/src/ascii.h +115 -0
  7. data/gumbo-parser/src/attribute.c +26 -28
  8. data/gumbo-parser/src/attribute.h +3 -23
  9. data/gumbo-parser/src/char_ref.c +5972 -6816
  10. data/gumbo-parser/src/char_ref.h +14 -45
  11. data/gumbo-parser/src/error.c +510 -163
  12. data/gumbo-parser/src/error.h +70 -147
  13. data/gumbo-parser/src/foreign_attrs.c +104 -0
  14. data/gumbo-parser/src/gumbo.h +577 -305
  15. data/gumbo-parser/src/insertion_mode.h +4 -28
  16. data/gumbo-parser/src/macros.h +91 -0
  17. data/gumbo-parser/src/parser.c +2922 -2228
  18. data/gumbo-parser/src/parser.h +6 -22
  19. data/gumbo-parser/src/replacement.h +33 -0
  20. data/gumbo-parser/src/string_buffer.c +43 -50
  21. data/gumbo-parser/src/string_buffer.h +24 -40
  22. data/gumbo-parser/src/string_piece.c +39 -39
  23. data/gumbo-parser/src/svg_attrs.c +174 -0
  24. data/gumbo-parser/src/svg_tags.c +137 -0
  25. data/gumbo-parser/src/tag.c +186 -59
  26. data/gumbo-parser/src/tag_lookup.c +382 -0
  27. data/gumbo-parser/src/tag_lookup.h +13 -0
  28. data/gumbo-parser/src/token_buffer.c +79 -0
  29. data/gumbo-parser/src/token_buffer.h +71 -0
  30. data/gumbo-parser/src/token_type.h +1 -25
  31. data/gumbo-parser/src/tokenizer.c +2127 -1561
  32. data/gumbo-parser/src/tokenizer.h +41 -52
  33. data/gumbo-parser/src/tokenizer_states.h +281 -45
  34. data/gumbo-parser/src/utf8.c +98 -123
  35. data/gumbo-parser/src/utf8.h +84 -52
  36. data/gumbo-parser/src/util.c +48 -38
  37. data/gumbo-parser/src/util.h +10 -40
  38. data/gumbo-parser/src/vector.c +45 -57
  39. data/gumbo-parser/src/vector.h +17 -39
  40. data/lib/nokogumbo.rb +11 -173
  41. data/lib/nokogumbo/html5.rb +252 -0
  42. data/lib/nokogumbo/html5/document.rb +53 -0
  43. data/lib/nokogumbo/html5/document_fragment.rb +62 -0
  44. data/lib/nokogumbo/html5/node.rb +72 -0
  45. data/lib/nokogumbo/version.rb +3 -0
  46. metadata +43 -24
  47. data/ext/nokogumboc/extconf.rb +0 -60
  48. data/ext/nokogumboc/nokogumbo.c +0 -295
  49. data/gumbo-parser/src/char_ref.rl +0 -2554
  50. data/gumbo-parser/src/string_piece.h +0 -38
  51. data/gumbo-parser/src/tag.in +0 -150
  52. data/gumbo-parser/src/tag_enum.h +0 -153
  53. data/gumbo-parser/src/tag_gperf.h +0 -105
  54. data/gumbo-parser/src/tag_sizes.h +0 -4
  55. data/gumbo-parser/src/tag_strings.h +0 -153
  56. data/gumbo-parser/visualc/include/strings.h +0 -4
  57. data/test-nokogumbo.rb +0 -190
@@ -1,32 +1,13 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // Error types, enums, and handling functions.
18
-
19
1
  #ifndef GUMBO_ERROR_H_
20
2
  #define GUMBO_ERROR_H_
21
- #ifdef _MSC_VER
22
- #define _CRT_SECURE_NO_WARNINGS
23
- #endif
3
+
24
4
  #include <stdint.h>
25
5
 
26
6
  #include "gumbo.h"
27
7
  #include "insertion_mode.h"
28
8
  #include "string_buffer.h"
29
9
  #include "token_type.h"
10
+ #include "tokenizer_states.h"
30
11
 
31
12
  #ifdef __cplusplus
32
13
  extern "C" {
@@ -35,84 +16,66 @@ extern "C" {
35
16
  struct GumboInternalParser;
36
17
 
37
18
  typedef enum {
19
+ // Defined errors.
20
+ // https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
21
+ GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT,
22
+ GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER,
23
+ GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER,
24
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
25
+ GUMBO_ERR_CDATA_IN_HTML_CONTENT,
26
+ GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
27
+ GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM,
28
+ GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
29
+ GUMBO_ERR_END_TAG_WITH_ATTRIBUTES,
30
+ GUMBO_ERR_DUPLICATE_ATTRIBUTE,
31
+ GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS,
32
+ GUMBO_ERR_EOF_BEFORE_TAG_NAME,
33
+ GUMBO_ERR_EOF_IN_CDATA,
34
+ GUMBO_ERR_EOF_IN_COMMENT,
35
+ GUMBO_ERR_EOF_IN_DOCTYPE,
36
+ GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT,
37
+ GUMBO_ERR_EOF_IN_TAG,
38
+ GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT,
39
+ GUMBO_ERR_INCORRECTLY_OPENED_COMMENT,
40
+ GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME,
41
+ GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME,
42
+ GUMBO_ERR_MISSING_ATTRIBUTE_VALUE,
43
+ GUMBO_ERR_MISSING_DOCTYPE_NAME,
44
+ GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER,
45
+ GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER,
46
+ GUMBO_ERR_MISSING_END_TAG_NAME,
47
+ GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
48
+ GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
49
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
50
+ GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
51
+ GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
52
+ GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME,
53
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
54
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
55
+ GUMBO_ERR_NESTED_COMMENT,
56
+ GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
57
+ GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM,
58
+ GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS,
59
+ GUMBO_ERR_NULL_CHARACTER_REFERENCE,
60
+ GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
61
+ GUMBO_ERR_SURROGATE_IN_INPUT_STREAM,
62
+ GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
63
+ GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME,
64
+ GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE,
65
+ GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME,
66
+ GUMBO_ERR_UNEXPECTED_NULL_CHARACTER,
67
+ GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME,
68
+ GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG,
69
+ GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE,
70
+
71
+ // Encoding errors.
38
72
  GUMBO_ERR_UTF8_INVALID,
39
73
  GUMBO_ERR_UTF8_TRUNCATED,
40
- GUMBO_ERR_UTF8_NULL,
41
- GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
42
- GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
43
- GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
44
- GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
45
- GUMBO_ERR_NAMED_CHAR_REF_INVALID,
46
- GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
47
- GUMBO_ERR_TAG_EOF,
48
- GUMBO_ERR_TAG_INVALID,
49
- GUMBO_ERR_CLOSE_TAG_EMPTY,
50
- GUMBO_ERR_CLOSE_TAG_EOF,
51
- GUMBO_ERR_CLOSE_TAG_INVALID,
52
- GUMBO_ERR_SCRIPT_EOF,
53
- GUMBO_ERR_ATTR_NAME_EOF,
54
- GUMBO_ERR_ATTR_NAME_INVALID,
55
- GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
56
- GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
57
- GUMBO_ERR_ATTR_UNQUOTED_EOF,
58
- GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
59
- GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
60
- GUMBO_ERR_ATTR_AFTER_EOF,
61
- GUMBO_ERR_ATTR_AFTER_INVALID,
62
- GUMBO_ERR_DUPLICATE_ATTR,
63
- GUMBO_ERR_SOLIDUS_EOF,
64
- GUMBO_ERR_SOLIDUS_INVALID,
65
- GUMBO_ERR_DASHES_OR_DOCTYPE,
66
- GUMBO_ERR_COMMENT_EOF,
67
- GUMBO_ERR_COMMENT_INVALID,
68
- GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
69
- GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
70
- GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
71
- GUMBO_ERR_COMMENT_END_BANG_EOF,
72
- GUMBO_ERR_DOCTYPE_EOF,
73
- GUMBO_ERR_DOCTYPE_INVALID,
74
- GUMBO_ERR_DOCTYPE_SPACE,
75
- GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
76
- GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
77
- GUMBO_ERR_DOCTYPE_END,
74
+
75
+ // Generic parser error.
78
76
  GUMBO_ERR_PARSER,
79
- GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
80
77
  } GumboErrorType;
81
78
 
82
- // Additional data for duplicated attributes.
83
- typedef struct GumboInternalDuplicateAttrError {
84
- // The name of the attribute. Owned by this struct.
85
- const char* name;
86
-
87
- // The (0-based) index within the attributes vector of the original
88
- // occurrence.
89
- unsigned int original_index;
90
-
91
- // The (0-based) index where the new occurrence would be.
92
- unsigned int new_index;
93
- } GumboDuplicateAttrError;
94
-
95
- // A simplified representation of the tokenizer state, designed to be more
96
- // useful to clients of this library than the internal representation. This
97
- // condenses the actual states used in the tokenizer state machine into a few
98
- // values that will be familiar to users of HTML.
99
- typedef enum {
100
- GUMBO_ERR_TOKENIZER_DATA,
101
- GUMBO_ERR_TOKENIZER_CHAR_REF,
102
- GUMBO_ERR_TOKENIZER_RCDATA,
103
- GUMBO_ERR_TOKENIZER_RAWTEXT,
104
- GUMBO_ERR_TOKENIZER_PLAINTEXT,
105
- GUMBO_ERR_TOKENIZER_SCRIPT,
106
- GUMBO_ERR_TOKENIZER_TAG,
107
- GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
108
- GUMBO_ERR_TOKENIZER_ATTR_NAME,
109
- GUMBO_ERR_TOKENIZER_ATTR_VALUE,
110
- GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
111
- GUMBO_ERR_TOKENIZER_COMMENT,
112
- GUMBO_ERR_TOKENIZER_DOCTYPE,
113
- GUMBO_ERR_TOKENIZER_CDATA,
114
- } GumboTokenizerErrorState;
115
-
116
79
  // Additional data for tokenizer errors.
117
80
  // This records the current state and codepoint encountered - this is usually
118
81
  // enough to reconstruct what went wrong and provide a friendly error message.
@@ -121,7 +84,7 @@ typedef struct GumboInternalTokenizerError {
121
84
  int codepoint;
122
85
 
123
86
  // The state that the tokenizer was in at the time.
124
- GumboTokenizerErrorState state;
87
+ GumboTokenizerEnum state;
125
88
  } GumboTokenizerError;
126
89
 
127
90
  // Additional data for parse errors.
@@ -129,61 +92,43 @@ typedef struct GumboInternalParserError {
129
92
  // The type of input token that resulted in this error.
130
93
  GumboTokenType input_type;
131
94
 
132
- // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
95
+ // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
133
96
  GumboTag input_tag;
134
97
 
135
98
  // The insertion mode that the parser was in at the time.
136
99
  GumboInsertionMode parser_state;
137
100
 
138
- // The tag stack at the point of the error. Note that this is an GumboVector
101
+ // The tag stack at the point of the error. Note that this is a GumboVector
139
102
  // of GumboTag's *stored by value* - cast the void* to a GumboTag directly to
140
103
  // get at the tag.
141
104
  GumboVector /* GumboTag */ tag_stack;
142
105
  } GumboParserError;
143
106
 
144
107
  // The overall error struct representing an error in decoding/tokenizing/parsing
145
- // the HTML. This contains an enumerated type flag, a source position, and then
108
+ // the HTML. This contains an enumerated type flag, a source position, and then
146
109
  // a union of fields containing data specific to the error.
147
- typedef struct GumboInternalError {
110
+ struct GumboInternalError {
148
111
  // The type of error.
149
112
  GumboErrorType type;
150
113
 
151
114
  // The position within the source file where the error occurred.
152
115
  GumboSourcePosition position;
153
116
 
154
- // A pointer to the byte within the original source file text where the error
155
- // occurred (note that this is not the same as position.offset, as that gives
156
- // character-based instead of byte-based offsets).
157
- const char* original_text;
117
+ // The piece of text that caused the error.
118
+ GumboStringPiece original_text;
158
119
 
159
120
  // Type-specific error information.
160
121
  union {
161
- // The code point we encountered, for:
162
- // * GUMBO_ERR_UTF8_INVALID
163
- // * GUMBO_ERR_UTF8_TRUNCATED
164
- // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
165
- // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
166
- uint64_t codepoint;
167
-
168
122
  // Tokenizer errors.
169
123
  GumboTokenizerError tokenizer;
170
124
 
171
- // Short textual data, for:
172
- // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
173
- // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
174
- GumboStringPiece text;
175
-
176
- // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
177
- GumboDuplicateAttrError duplicate_attr;
178
-
179
- // Parser state, for GUMBO_ERR_PARSER and
180
- // GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
181
- struct GumboInternalParserError parser;
125
+ // Parser errors.
126
+ GumboParserError parser;
182
127
  } v;
183
- } GumboError;
128
+ };
184
129
 
185
130
  // Adds a new error to the parser's error list, and returns a pointer to it so
186
- // that clients can fill out the rest of its fields. May return NULL if we're
131
+ // that clients can fill out the rest of its fields. May return NULL if we're
187
132
  // already over the max_errors field specified in GumboOptions.
188
133
  GumboError* gumbo_add_error(struct GumboInternalParser* parser);
189
134
 
@@ -194,32 +139,10 @@ void gumbo_init_errors(struct GumboInternalParser* errors);
194
139
  void gumbo_destroy_errors(struct GumboInternalParser* errors);
195
140
 
196
141
  // Frees the memory used for a single GumboError.
197
- void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
198
-
199
- // Prints an error to a string. This fills an empty GumboStringBuffer with a
200
- // freshly-allocated buffer containing the error message text. The caller is
201
- // responsible for deleting the buffer. (Note that the buffer is allocated with
202
- // the allocator specified in the GumboParser config and hence should be freed
203
- // by gumbo_parser_deallocate().)
204
- void gumbo_error_to_string(struct GumboInternalParser* parser,
205
- const GumboError* error, GumboStringBuffer* output);
206
-
207
- // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
208
- // with a freshly-allocated buffer containing the error message text. The
209
- // caller is responsible for deleting the buffer. (Note that the buffer is
210
- // allocated with the allocator specified in the GumboParser config and hence
211
- // should be freed by gumbo_parser_deallocate().)
212
- void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser,
213
- const GumboError* error, const char* source_text,
214
- GumboStringBuffer* output);
215
-
216
- // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
217
- // of writing to a string.
218
- void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser,
219
- const GumboError* error, const char* source_text);
142
+ void gumbo_error_destroy(GumboError* error);
220
143
 
221
144
  #ifdef __cplusplus
222
145
  }
223
146
  #endif
224
147
 
225
- #endif // GUMBO_ERROR_H_
148
+ #endif // GUMBO_ERROR_H_
@@ -0,0 +1,104 @@
1
+ /* ANSI-C code produced by gperf version 3.1 */
2
+ /* Command-line: gperf -m100 -n lib/foreign_attrs.gperf */
3
+ /* Computed positions: -k'2,8' */
4
+ /* Filtered by: mk/gperf-filter.sed */
5
+
6
+ #include "replacement.h"
7
+ #include "macros.h"
8
+ #include <string.h>
9
+
10
+ #define TOTAL_KEYWORDS 11
11
+ #define MIN_WORD_LENGTH 5
12
+ #define MAX_WORD_LENGTH 13
13
+ #define MIN_HASH_VALUE 0
14
+ #define MAX_HASH_VALUE 10
15
+ /* maximum key range = 11, duplicates = 0 */
16
+
17
+ static inline unsigned int
18
+ hash (register const char *str, register size_t len)
19
+ {
20
+ static const unsigned char asso_values[] =
21
+ {
22
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
23
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
24
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
25
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
26
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
27
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
28
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
29
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
30
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
31
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 2,
32
+ 11, 10, 11, 9, 7, 6, 11, 11, 1, 0,
33
+ 11, 5, 11, 11, 4, 11, 11, 11, 11, 11,
34
+ 11, 3, 11, 11, 11, 11, 11, 11, 11, 11,
35
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
36
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
37
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
38
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
39
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
40
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
41
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
42
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
43
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
44
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
45
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
46
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
47
+ 11, 11, 11, 11, 11, 11
48
+ };
49
+ register unsigned int hval = 0;
50
+
51
+ switch (len)
52
+ {
53
+ default:
54
+ hval += asso_values[(unsigned char)str[7]];
55
+ /*FALLTHROUGH*/
56
+ case 7:
57
+ case 6:
58
+ case 5:
59
+ case 4:
60
+ case 3:
61
+ case 2:
62
+ hval += asso_values[(unsigned char)str[1]];
63
+ break;
64
+ }
65
+ return hval;
66
+ }
67
+
68
+ const ForeignAttrReplacement *
69
+ gumbo_get_foreign_attr_replacement (register const char *str, register size_t len)
70
+ {
71
+ static const unsigned char lengthtable[] =
72
+ {
73
+ 5, 11, 9, 13, 10, 10, 10, 11, 10, 8, 8
74
+ };
75
+ static const ForeignAttrReplacement wordlist[] =
76
+ {
77
+ {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
78
+ {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
79
+ {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
80
+ {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
81
+ {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
82
+ {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
83
+ {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
84
+ {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
85
+ {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
86
+ {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
87
+ {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML}
88
+ };
89
+
90
+ if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
91
+ {
92
+ register unsigned int key = hash (str, len);
93
+
94
+ if (key <= MAX_HASH_VALUE)
95
+ if (len == lengthtable[key])
96
+ {
97
+ register const char *s = wordlist[key].from;
98
+
99
+ if (s && *str == *s && !memcmp (str + 1, s + 1, len - 1))
100
+ return &wordlist[key];
101
+ }
102
+ }
103
+ return 0;
104
+ }
@@ -1,51 +1,33 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
18
- // GUMBO_ as a prefix for enum constants (static constants get the Google-style
19
- // kGumbo prefix).
1
+ // Copyright 2010 Google Inc.
2
+ // Copyright 2018 Craig Barnes.
3
+ // Licensed under the Apache License, version 2.0.
4
+
5
+ // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions,
6
+ // GUMBO_ as a prefix for enum constants and kGumbo as a prefix for
7
+ // static constants
20
8
 
21
9
  /**
22
10
  * @file
23
11
  * @mainpage Gumbo HTML Parser
24
12
  *
25
- * This provides a conformant, no-dependencies implementation of the HTML5
26
- * parsing algorithm. It supports only UTF8; if you need to parse a different
27
- * encoding, run a preprocessing step to convert to UTF8. It returns a parse
28
- * tree made of the structs in this file.
13
+ * This provides a conformant, no-dependencies implementation of the
14
+ * [HTML5] parsing algorithm. It supports only UTF-8 -- if you need
15
+ * to parse a different encoding, run a preprocessing step to convert
16
+ * to UTF-8. It returns a parse tree made of the structs in this file.
29
17
  *
30
18
  * Example:
31
19
  * @code
32
20
  * GumboOutput* output = gumbo_parse(input);
33
21
  * do_something_with_doctype(output->document);
34
22
  * do_something_with_html_tree(output->root);
35
- * gumbo_destroy_output(&options, output);
23
+ * gumbo_destroy_output(output);
36
24
  * @endcode
37
- * HTML5 Spec:
38
25
  *
39
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
26
+ * [HTML5]: https://html.spec.whatwg.org/multipage/
40
27
  */
41
28
 
42
- #ifndef GUMBO_GUMBO_H_
43
- #define GUMBO_GUMBO_H_
44
-
45
- #ifdef _MSC_VER
46
- #define _CRT_SECURE_NO_WARNINGS
47
- #define fileno _fileno
48
- #endif
29
+ #ifndef GUMBO_H
30
+ #define GUMBO_H
49
31
 
50
32
  #include <stdbool.h>
51
33
  #include <stddef.h>
@@ -55,73 +37,77 @@ extern "C" {
55
37
  #endif
56
38
 
57
39
  /**
58
- * A struct representing a character position within the original text buffer.
59
- * Line and column numbers are 1-based and offsets are 0-based, which matches
60
- * how most editors and command-line tools work. Also, columns measure
61
- * positions in terms of characters while offsets measure by bytes; this is
62
- * because the offset field is often used to pull out a particular region of
63
- * text (which in most languages that bind to C implies pointer arithmetic on a
64
- * buffer of bytes), while the column field is often used to reference a
65
- * particular column on a printable display, which nowadays is usually UTF-8.
40
+ * A struct representing a character position within the original text
41
+ * buffer. Line and column numbers are 1-based and offsets are 0-based,
42
+ * which matches how most editors and command-line tools work.
66
43
  */
67
44
  typedef struct {
68
- unsigned int line;
69
- unsigned int column;
70
- unsigned int offset;
45
+ size_t line;
46
+ size_t column;
47
+ size_t offset;
71
48
  } GumboSourcePosition;
72
49
 
73
50
  /**
74
- * A SourcePosition used for elements that have no source position, i.e.
75
- * parser-inserted elements.
76
- */
77
- extern const GumboSourcePosition kGumboEmptySourcePosition;
78
-
79
- /**
80
- * A struct representing a string or part of a string. Strings within the
81
- * parser are represented by a char* and a length; the char* points into
82
- * an existing data buffer owned by some other code (often the original input).
83
- * GumboStringPieces are assumed (by convention) to be immutable, because they
84
- * may share data. Use GumboStringBuffer if you need to construct a string.
85
- * Clients should assume that it is not NUL-terminated, and should always use
86
- * explicit lengths when manipulating them.
51
+ * A struct representing a string or part of a string. Strings within
52
+ * the parser are represented by a `char*` and a length; the `char*`
53
+ * points into an existing data buffer owned by some other code (often
54
+ * the original input). `GumboStringPiece`s are assumed (by convention)
55
+ * to be immutable, because they may share data. Clients should assume
56
+ * that it is not NUL-terminated and should always use explicit lengths
57
+ * when manipulating them.
87
58
  */
88
59
  typedef struct {
89
- /** A pointer to the beginning of the string. NULL iff length == 0. */
60
+ /** A pointer to the beginning of the string. `NULL` if `length == 0`. */
90
61
  const char* data;
91
62
 
92
- /** The length of the string fragment, in bytes. May be zero. */
63
+ /** The length of the string fragment, in bytes (may be zero). */
93
64
  size_t length;
94
65
  } GumboStringPiece;
95
66
 
67
+ #define GUMBO_EMPTY_STRING_INIT { .data = NULL, .length = 0 }
96
68
  /** A constant to represent a 0-length null string. */
97
- extern const GumboStringPiece kGumboEmptyString;
69
+ #define kGumboEmptyString (const GumboStringPiece)GUMBO_EMPTY_STRING_INIT
98
70
 
99
71
  /**
100
- * Compares two GumboStringPieces, and returns true if they're equal or false
101
- * otherwise.
72
+ * Compares two `GumboStringPiece`s, and returns `true` if they're
73
+ * equal or `false` otherwise.
102
74
  */
103
- bool gumbo_string_equals(
104
- const GumboStringPiece* str1, const GumboStringPiece* str2);
75
+ bool gumbo_string_equals (
76
+ const GumboStringPiece* str1,
77
+ const GumboStringPiece* str2
78
+ );
105
79
 
106
80
  /**
107
- * Compares two GumboStringPieces ignoring case, and returns true if they're
108
- * equal or false otherwise.
81
+ * Compares two `GumboStringPiece`s, ignoring case, and returns `true`
82
+ * if they're equal or `false` otherwise.
109
83
  */
110
- bool gumbo_string_equals_ignore_case(
111
- const GumboStringPiece* str1, const GumboStringPiece* str2);
84
+ bool gumbo_string_equals_ignore_case (
85
+ const GumboStringPiece* str1,
86
+ const GumboStringPiece* str2
87
+ );
112
88
 
113
89
  /**
114
- * A simple vector implementation. This stores a pointer to a data array and a
115
- * length. All elements are stored as void*; client code must cast to the
116
- * appropriate type. Overflows upon addition result in reallocation of the data
117
- * array, with the size doubling to maintain O(1) amortized cost. There is no
118
- * removal function, as this isn't needed for any of the operations within this
119
- * library. Iteration can be done through inspecting the structure directly in
120
- * a for-loop.
90
+ * Check if the first `GumboStringPiece` is a prefix of the second, ignoring
91
+ * case.
92
+ */
93
+ bool gumbo_string_prefix_ignore_case (
94
+ const GumboStringPiece* prefix,
95
+ const GumboStringPiece* str
96
+ );
97
+
98
+ /**
99
+ * A simple vector implementation. This stores a pointer to a data array
100
+ * and a length. All elements are stored as `void*`; client code must
101
+ * cast to the appropriate type. Overflows upon addition result in
102
+ * reallocation of the data array, with the size doubling to maintain
103
+ * `O(1)` amortized cost. There is no removal function, as this isn't
104
+ * needed for any of the operations within this library. Iteration can
105
+ * be done through inspecting the structure directly in a `for` loop.
121
106
  */
122
107
  typedef struct {
123
- /** Data elements. This points to a dynamically-allocated array of capacity
124
- * elements, each a void* to the element itself.
108
+ /**
109
+ * Data elements. This points to a dynamically-allocated array of
110
+ * `capacity` elements, each a `void*` to the element itself.
125
111
  */
126
112
  void** data;
127
113
 
@@ -132,82 +118,230 @@ typedef struct {
132
118
  unsigned int capacity;
133
119
  } GumboVector;
134
120
 
135
- /** An empty (0-length, 0-capacity) GumboVector. */
136
- extern const GumboVector kGumboEmptyVector;
121
+ # define GUMBO_EMPTY_VECTOR_INIT { .data = NULL, .length = 0, .capacity = 0 }
122
+ /** An empty (0-length, 0-capacity) `GumboVector`. */
123
+ #define kGumboEmptyVector (const GumboVector)GUMBO_EMPTY_VECTOR_INIT
137
124
 
138
125
  /**
139
- * Returns the first index at which an element appears in this vector (testing
140
- * by pointer equality), or -1 if it never does.
126
+ * Returns the first index at which an element appears in this vector
127
+ * (testing by pointer equality), or `-1` if it never does.
141
128
  */
142
129
  int gumbo_vector_index_of(GumboVector* vector, const void* element);
143
130
 
144
131
  /**
145
- * An enum for all the tags defined in the HTML5 standard. These correspond to
146
- * the tag names themselves. Enum constants exist only for tags which appear in
147
- * the spec itself (or for tags with special handling in the SVG and MathML
148
- * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
149
- * name can be obtained through original_tag.
132
+ * An `enum` for all the tags defined in the HTML5 standard. These
133
+ * correspond to the tag names themselves. Enum constants exist only
134
+ * for tags that appear in the spec itself (or for tags with special
135
+ * handling in the SVG and MathML namespaces). Any other tags appear
136
+ * as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained
137
+ * through `original_tag`.
150
138
  *
151
- * This is mostly for API convenience, so that clients of this library don't
152
- * need to perform a strcasecmp to find the normalized tag name. It also has
153
- * efficiency benefits, by letting the parser work with enums instead of
154
- * strings.
139
+ * This is mostly for API convenience, so that clients of this library
140
+ * don't need to perform a `strcasecmp` to find the normalized tag
141
+ * name. It also has efficiency benefits, by letting the parser work
142
+ * with enums instead of strings.
155
143
  */
156
144
  typedef enum {
157
- // Load all the tags from an external source, generated from tag.in.
158
- #include "tag_enum.h"
159
- // Used for all tags that don't have special handling in HTML. Add new tags
160
- // to the end of tag.in so as to preserve backwards-compatibility.
145
+ GUMBO_TAG_HTML,
146
+ GUMBO_TAG_HEAD,
147
+ GUMBO_TAG_TITLE,
148
+ GUMBO_TAG_BASE,
149
+ GUMBO_TAG_LINK,
150
+ GUMBO_TAG_META,
151
+ GUMBO_TAG_STYLE,
152
+ GUMBO_TAG_SCRIPT,
153
+ GUMBO_TAG_NOSCRIPT,
154
+ GUMBO_TAG_TEMPLATE,
155
+ GUMBO_TAG_BODY,
156
+ GUMBO_TAG_ARTICLE,
157
+ GUMBO_TAG_SECTION,
158
+ GUMBO_TAG_NAV,
159
+ GUMBO_TAG_ASIDE,
160
+ GUMBO_TAG_H1,
161
+ GUMBO_TAG_H2,
162
+ GUMBO_TAG_H3,
163
+ GUMBO_TAG_H4,
164
+ GUMBO_TAG_H5,
165
+ GUMBO_TAG_H6,
166
+ GUMBO_TAG_HGROUP,
167
+ GUMBO_TAG_HEADER,
168
+ GUMBO_TAG_FOOTER,
169
+ GUMBO_TAG_ADDRESS,
170
+ GUMBO_TAG_P,
171
+ GUMBO_TAG_HR,
172
+ GUMBO_TAG_PRE,
173
+ GUMBO_TAG_BLOCKQUOTE,
174
+ GUMBO_TAG_OL,
175
+ GUMBO_TAG_UL,
176
+ GUMBO_TAG_LI,
177
+ GUMBO_TAG_DL,
178
+ GUMBO_TAG_DT,
179
+ GUMBO_TAG_DD,
180
+ GUMBO_TAG_FIGURE,
181
+ GUMBO_TAG_FIGCAPTION,
182
+ GUMBO_TAG_MAIN,
183
+ GUMBO_TAG_DIV,
184
+ GUMBO_TAG_A,
185
+ GUMBO_TAG_EM,
186
+ GUMBO_TAG_STRONG,
187
+ GUMBO_TAG_SMALL,
188
+ GUMBO_TAG_S,
189
+ GUMBO_TAG_CITE,
190
+ GUMBO_TAG_Q,
191
+ GUMBO_TAG_DFN,
192
+ GUMBO_TAG_ABBR,
193
+ GUMBO_TAG_DATA,
194
+ GUMBO_TAG_TIME,
195
+ GUMBO_TAG_CODE,
196
+ GUMBO_TAG_VAR,
197
+ GUMBO_TAG_SAMP,
198
+ GUMBO_TAG_KBD,
199
+ GUMBO_TAG_SUB,
200
+ GUMBO_TAG_SUP,
201
+ GUMBO_TAG_I,
202
+ GUMBO_TAG_B,
203
+ GUMBO_TAG_U,
204
+ GUMBO_TAG_MARK,
205
+ GUMBO_TAG_RUBY,
206
+ GUMBO_TAG_RT,
207
+ GUMBO_TAG_RP,
208
+ GUMBO_TAG_BDI,
209
+ GUMBO_TAG_BDO,
210
+ GUMBO_TAG_SPAN,
211
+ GUMBO_TAG_BR,
212
+ GUMBO_TAG_WBR,
213
+ GUMBO_TAG_INS,
214
+ GUMBO_TAG_DEL,
215
+ GUMBO_TAG_IMAGE,
216
+ GUMBO_TAG_IMG,
217
+ GUMBO_TAG_IFRAME,
218
+ GUMBO_TAG_EMBED,
219
+ GUMBO_TAG_OBJECT,
220
+ GUMBO_TAG_PARAM,
221
+ GUMBO_TAG_VIDEO,
222
+ GUMBO_TAG_AUDIO,
223
+ GUMBO_TAG_SOURCE,
224
+ GUMBO_TAG_TRACK,
225
+ GUMBO_TAG_CANVAS,
226
+ GUMBO_TAG_MAP,
227
+ GUMBO_TAG_AREA,
228
+ GUMBO_TAG_MATH,
229
+ GUMBO_TAG_MI,
230
+ GUMBO_TAG_MO,
231
+ GUMBO_TAG_MN,
232
+ GUMBO_TAG_MS,
233
+ GUMBO_TAG_MTEXT,
234
+ GUMBO_TAG_MGLYPH,
235
+ GUMBO_TAG_MALIGNMARK,
236
+ GUMBO_TAG_ANNOTATION_XML,
237
+ GUMBO_TAG_SVG,
238
+ GUMBO_TAG_FOREIGNOBJECT,
239
+ GUMBO_TAG_DESC,
240
+ GUMBO_TAG_TABLE,
241
+ GUMBO_TAG_CAPTION,
242
+ GUMBO_TAG_COLGROUP,
243
+ GUMBO_TAG_COL,
244
+ GUMBO_TAG_TBODY,
245
+ GUMBO_TAG_THEAD,
246
+ GUMBO_TAG_TFOOT,
247
+ GUMBO_TAG_TR,
248
+ GUMBO_TAG_TD,
249
+ GUMBO_TAG_TH,
250
+ GUMBO_TAG_FORM,
251
+ GUMBO_TAG_FIELDSET,
252
+ GUMBO_TAG_LEGEND,
253
+ GUMBO_TAG_LABEL,
254
+ GUMBO_TAG_INPUT,
255
+ GUMBO_TAG_BUTTON,
256
+ GUMBO_TAG_SELECT,
257
+ GUMBO_TAG_DATALIST,
258
+ GUMBO_TAG_OPTGROUP,
259
+ GUMBO_TAG_OPTION,
260
+ GUMBO_TAG_TEXTAREA,
261
+ GUMBO_TAG_KEYGEN,
262
+ GUMBO_TAG_OUTPUT,
263
+ GUMBO_TAG_PROGRESS,
264
+ GUMBO_TAG_METER,
265
+ GUMBO_TAG_DETAILS,
266
+ GUMBO_TAG_SUMMARY,
267
+ GUMBO_TAG_MENU,
268
+ GUMBO_TAG_MENUITEM,
269
+ GUMBO_TAG_APPLET,
270
+ GUMBO_TAG_ACRONYM,
271
+ GUMBO_TAG_BGSOUND,
272
+ GUMBO_TAG_DIR,
273
+ GUMBO_TAG_FRAME,
274
+ GUMBO_TAG_FRAMESET,
275
+ GUMBO_TAG_NOFRAMES,
276
+ GUMBO_TAG_LISTING,
277
+ GUMBO_TAG_XMP,
278
+ GUMBO_TAG_NEXTID,
279
+ GUMBO_TAG_NOEMBED,
280
+ GUMBO_TAG_PLAINTEXT,
281
+ GUMBO_TAG_RB,
282
+ GUMBO_TAG_STRIKE,
283
+ GUMBO_TAG_BASEFONT,
284
+ GUMBO_TAG_BIG,
285
+ GUMBO_TAG_BLINK,
286
+ GUMBO_TAG_CENTER,
287
+ GUMBO_TAG_FONT,
288
+ GUMBO_TAG_MARQUEE,
289
+ GUMBO_TAG_MULTICOL,
290
+ GUMBO_TAG_NOBR,
291
+ GUMBO_TAG_SPACER,
292
+ GUMBO_TAG_TT,
293
+ GUMBO_TAG_RTC,
294
+ GUMBO_TAG_DIALOG,
295
+ // Used for all tags that don't have special handling in HTML.
161
296
  GUMBO_TAG_UNKNOWN,
162
297
  // A marker value to indicate the end of the enum, for iterating over it.
163
- // Also used as the terminator for varargs functions that take tags.
164
298
  GUMBO_TAG_LAST,
165
299
  } GumboTag;
166
300
 
167
301
  /**
168
- * Returns the normalized (usually all-lowercased, except for foreign content)
169
- * tag name for an GumboTag enum. Return value is static data owned by the
170
- * library.
302
+ * Returns the normalized (all lower case) tag name for a `GumboTag` enum. The
303
+ * return value is static data owned by the library.
171
304
  */
172
305
  const char* gumbo_normalized_tagname(GumboTag tag);
173
306
 
174
307
  /**
175
- * Extracts the tag name from the original_text field of an element or token by
176
- * stripping off </> characters and attributes and adjusting the passed-in
177
- * GumboStringPiece appropriately. The tag name is in the original case and
178
- * shares a buffer with the original text, to simplify memory management.
179
- * Behavior is undefined if a string-piece that doesn't represent an HTML tag
180
- * (<tagname> or </tagname>) is passed in. If the string piece is completely
181
- * empty (NULL data pointer), then this function will exit successfully as a
182
- * no-op.
308
+ * Extracts the tag name from the `original_text` field of an element
309
+ * or token by stripping off `</>` characters and attributes and
310
+ * adjusting the passed-in `GumboStringPiece` appropriately. The tag
311
+ * name is in the original case and shares a buffer with the original
312
+ * text, to simplify memory management. Behavior is undefined if a
313
+ * string piece that doesn't represent an HTML tag (`<tagname>` or
314
+ * `</tagname>`) is passed in. If the string piece is completely
315
+ * empty (`NULL` data pointer), then this function will exit
316
+ * successfully as a no-op.
183
317
  */
184
318
  void gumbo_tag_from_original_text(GumboStringPiece* text);
185
319
 
186
320
  /**
187
- * Fixes the case of SVG elements that are not all lowercase.
188
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
189
- * This is not done at parse time because there's no place to store a mutated
190
- * tag name. tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags
191
- * without special handling), while original_tag_name is a pointer into the
192
- * original buffer. Instead, we provide this helper function that clients can
193
- * use to rename SVG tags as appropriate.
194
- * Returns the case-normalized SVG tagname if a replacement is found, or NULL if
195
- * no normalization is called for. The return value is static data and owned by
196
- * the library.
321
+ * Fixes the case of SVG elements that are not all lowercase. This is
322
+ * not done at parse time because there's no place to store a mutated
323
+ * tag name. `tag` is an enum (which will be `GUMBO_TAG_UNKNOWN` for most
324
+ * SVG tags without special handling), while `original_tag` is a
325
+ * pointer into the original buffer. Instead, we provide this helper
326
+ * function that clients can use to rename SVG tags as appropriate.
327
+ * Returns the case-normalized SVG tagname if a replacement is found, or
328
+ * `NULL` if no normalization is called for. The return value is static
329
+ * data and owned by the library.
330
+ *
331
+ * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
197
332
  */
198
333
  const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
199
334
 
200
335
  /**
201
- * Converts a tag name string (which may be in upper or mixed case) to a tag
202
- * enum. The `tag` version expects `tagname` to be NULL-terminated
336
+ * Converts a tag name string (which may be in upper or mixed case) to a
337
+ * tag enum.
203
338
  */
204
- GumboTag gumbo_tag_enum(const char* tagname);
205
- GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
339
+ GumboTag gumbo_tagn_enum(const char* tagname, size_t length);
206
340
 
207
341
  /**
208
342
  * Attribute namespaces.
209
- * HTML includes special handling for XLink, XML, and XMLNS namespaces on
210
- * attributes. Everything else goes in the generic "NONE" namespace.
343
+ * HTML includes special handling for XLink, XML, and XMLNS namespaces
344
+ * on attributes. Everything else goes in the generic "NONE" namespace.
211
345
  */
212
346
  typedef enum {
213
347
  GUMBO_ATTR_NAMESPACE_NONE,
@@ -217,46 +351,47 @@ typedef enum {
217
351
  } GumboAttributeNamespaceEnum;
218
352
 
219
353
  /**
220
- * A struct representing a single attribute on an HTML tag. This is a
221
- * name-value pair, but also includes information about source locations and
222
- * original source text.
354
+ * A struct representing a single attribute on an HTML tag. This is a
355
+ * name-value pair, but also includes information about source locations
356
+ * and original source text.
223
357
  */
224
358
  typedef struct {
225
359
  /**
226
- * The namespace for the attribute. This will usually be
227
- * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
228
- * values, per:
229
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
360
+ * The namespace for the attribute. This will usually be
361
+ * `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes
362
+ * take special values, per:
363
+ * https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
230
364
  */
231
365
  GumboAttributeNamespaceEnum attr_namespace;
232
366
 
233
367
  /**
234
- * The name of the attribute. This is in a freshly-allocated buffer to deal
235
- * with case-normalization, and is null-terminated.
368
+ * The name of the attribute. This is in a freshly-allocated buffer to
369
+ * deal with case-normalization and is null-terminated.
236
370
  */
237
371
  const char* name;
238
372
 
239
373
  /**
240
- * The original text of the attribute name, as a pointer into the original
241
- * source buffer.
374
+ * The original text of the attribute name, as a pointer into the
375
+ * original source buffer.
242
376
  */
243
377
  GumboStringPiece original_name;
244
378
 
245
379
  /**
246
- * The value of the attribute. This is in a freshly-allocated buffer to deal
247
- * with unescaping, and is null-terminated. It does not include any quotes
248
- * that surround the attribute. If the attribute has no value (for example,
249
- * 'selected' on a checkbox), this will be an empty string.
380
+ * The value of the attribute. This is in a freshly-allocated buffer
381
+ * to deal with unescaping and is null-terminated. It does not include
382
+ * any quotes that surround the attribute. If the attribute has no
383
+ * value (for example, `selected` on a checkbox) this will be an empty
384
+ * string.
250
385
  */
251
386
  const char* value;
252
387
 
253
388
  /**
254
- * The original text of the value of the attribute. This points into the
255
- * original source buffer. It includes any quotes that surround the
256
- * attribute, and you can look at original_value.data[0] and
257
- * original_value.data[original_value.length - 1] to determine what the quote
258
- * characters were. If the attribute has no value, this will be a 0-length
259
- * string.
389
+ * The original text of the value of the attribute. This points into
390
+ * the original source buffer. It includes any quotes that surround
391
+ * the attribute and you can look at `original_value.data[0]` and
392
+ * `original_value.data[original_value.length - 1]` to determine what
393
+ * the quote characters were. If the attribute has no value this will
394
+ * be a 0-length string.
260
395
  */
261
396
  GumboStringPiece original_value;
262
397
 
@@ -264,9 +399,9 @@ typedef struct {
264
399
  GumboSourcePosition name_start;
265
400
 
266
401
  /**
267
- * The ending position of the attribute name. This is not always derivable
402
+ * The ending position of the attribute name. This is not always derivable
268
403
  * from the starting position of the value because of the possibility of
269
- * whitespace around the = sign.
404
+ * whitespace around the `=` sign.
270
405
  */
271
406
  GumboSourcePosition name_end;
272
407
 
@@ -278,34 +413,37 @@ typedef struct {
278
413
  } GumboAttribute;
279
414
 
280
415
  /**
281
- * Given a vector of GumboAttributes, look up the one with the specified name
282
- * and return it, or NULL if no such attribute exists. This uses a
283
- * case-insensitive match, as HTML is case-insensitive.
416
+ * Given a vector of `GumboAttribute`s, look up the one with the
417
+ * specified name and return it, or `NULL` if no such attribute exists.
418
+ * This uses a case-insensitive match, as HTML is case-insensitive.
284
419
  */
285
420
  GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
286
421
 
287
422
  /**
288
- * Enum denoting the type of node. This determines the type of the node.v
289
- * union.
423
+ * Enum denoting the type of node. This determines the type of the
424
+ * `node.v` union.
290
425
  */
291
426
  typedef enum {
292
- /** Document node. v will be a GumboDocument. */
427
+ /** Document node. `v` will be a `GumboDocument`. */
293
428
  GUMBO_NODE_DOCUMENT,
294
- /** Element node. v will be a GumboElement. */
429
+ /** Element node. `v` will be a `GumboElement`. */
295
430
  GUMBO_NODE_ELEMENT,
296
- /** Text node. v will be a GumboText. */
431
+ /** Text node. `v` will be a `GumboText`. */
297
432
  GUMBO_NODE_TEXT,
298
- /** CDATA node. v will be a GumboText. */
433
+ /** CDATA node. `v` will be a `GumboText`. */
299
434
  GUMBO_NODE_CDATA,
300
- /** Comment node. v will be a GumboText, excluding comment delimiters. */
435
+ /** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */
301
436
  GUMBO_NODE_COMMENT,
302
- /** Text node, where all contents is whitespace. v will be a GumboText. */
437
+ /** Text node, where all content is whitespace. `v` will be a `GumboText`. */
303
438
  GUMBO_NODE_WHITESPACE,
304
- /** Template node. This is separate from GUMBO_NODE_ELEMENT because many
305
- * client libraries will want to ignore the contents of template nodes, as
306
- * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing
307
- * here, while clients that want to include template contents should also
308
- * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */
439
+ /**
440
+ * Template node. This is separate from `GUMBO_NODE_ELEMENT` because
441
+ * many client libraries will want to ignore the contents of template
442
+ * nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will
443
+ * do the right thing here, while clients that want to include template
444
+ * contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a
445
+ * `GumboElement`.
446
+ */
309
447
  GUMBO_NODE_TEMPLATE
310
448
  } GumboNodeType;
311
449
 
@@ -315,9 +453,7 @@ typedef enum {
315
453
  */
316
454
  typedef struct GumboInternalNode GumboNode;
317
455
 
318
- /**
319
- * http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode
320
- */
456
+ /** https://dom.spec.whatwg.org/#concept-document-quirks */
321
457
  typedef enum {
322
458
  GUMBO_DOCTYPE_NO_QUIRKS,
323
459
  GUMBO_DOCTYPE_QUIRKS,
@@ -326,10 +462,11 @@ typedef enum {
326
462
 
327
463
  /**
328
464
  * Namespaces.
329
- * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather,
330
- * anything inside an <svg> tag is in the SVG namespace, anything inside the
331
- * <math> tag is in the MathML namespace, and anything else is inside the HTML
332
- * namespace. No other namespaces are supported, so this can be an enum only.
465
+ * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
466
+ * Rather, anything inside an `<svg>` tag is in the SVG namespace,
467
+ * anything inside the `<math>` tag is in the MathML namespace, and
468
+ * anything else is inside the HTML namespace. No other namespaces are
469
+ * supported, so this can be an `enum`.
333
470
  */
334
471
  typedef enum {
335
472
  GUMBO_NAMESPACE_HTML,
@@ -339,66 +476,70 @@ typedef enum {
339
476
 
340
477
  /**
341
478
  * Parse flags.
342
- * We track the reasons for parser insertion of nodes and store them in a
343
- * bitvector in the node itself. This lets client code optimize out nodes that
344
- * are implied by the HTML structure of the document, or flag constructs that
345
- * may not be allowed by a style guide, or track the prevalence of incorrect or
346
- * tricky HTML code.
479
+ * We track the reasons for parser insertion of nodes and store them in
480
+ * a bitvector in the node itself. This lets client code optimize out
481
+ * nodes that are implied by the HTML structure of the document, or flag
482
+ * constructs that may not be allowed by a style guide, or track the
483
+ * prevalence of incorrect or tricky HTML code.
347
484
  */
348
485
  typedef enum {
349
486
  /**
350
- * A normal node - both start and end tags appear in the source, nothing has
351
- * been reparented.
487
+ * A normal node -- both start and end tags appear in the source,
488
+ * nothing has been reparented.
352
489
  */
353
490
  GUMBO_INSERTION_NORMAL = 0,
354
491
 
355
492
  /**
356
- * A node inserted by the parser to fulfill some implicit insertion rule.
357
- * This is usually set in addition to some other flag giving a more specific
358
- * insertion reason; it's a generic catch-all term meaning "The start tag for
359
- * this node did not appear in the document source".
493
+ * A node inserted by the parser to fulfill some implicit insertion
494
+ * rule. This is usually set in addition to some other flag giving a
495
+ * more specific insertion reason; it's a generic catch-all term
496
+ * meaning "The start tag for this node did not appear in the document
497
+ * source".
360
498
  */
361
499
  GUMBO_INSERTION_BY_PARSER = 1 << 0,
362
500
 
363
501
  /**
364
- * A flag indicating that the end tag for this node did not appear in the
365
- * document source. Note that in some cases, you can still have
366
- * parser-inserted nodes with an explicit end tag: for example, "Text</html>"
367
- * has GUMBO_INSERTED_BY_PARSER set on the <html> node, but
368
- * GUMBO_INSERTED_END_TAG_IMPLICITLY is unset, as the </html> tag actually
369
- * exists. This flag will be set only if the end tag is completely missing;
370
- * in some cases, the end tag may be misplaced (eg. a </body> tag with text
371
- * afterwards), which will leave this flag unset and require clients to
372
- * inspect the parse errors for that case.
502
+ * A flag indicating that the end tag for this node did not appear in
503
+ * the document source. Note that in some cases, you can still have
504
+ * parser-inserted nodes with an explicit end tag. For example,
505
+ * `Text</html>` has `GUMBO_INSERTION_BY_PARSER` set on the `<html>`
506
+ * node, but `GUMBO_INSERTION_IMPLICIT_END_TAG` is unset, as the
507
+ * `</html>` tag actually exists.
508
+ *
509
+ * This flag will be set only if the end tag is completely missing.
510
+ * In some cases, the end tag may be misplaced (e.g. a `</body>` tag
511
+ * with text afterwards), which will leave this flag unset and require
512
+ * clients to inspect the parse errors for that case.
373
513
  */
374
514
  GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
375
515
 
376
516
  // Value 1 << 2 was for a flag that has since been removed.
377
517
 
378
518
  /**
379
- * A flag for nodes that are inserted because their presence is implied by
380
- * other tags, eg. <html>, <head>, <body>, <tbody>, etc.
519
+ * A flag for nodes that are inserted because their presence is
520
+ * implied by other tags, e.g. `<html>`, `<head>`, `<body>`,
521
+ * `<tbody>`, etc.
381
522
  */
382
523
  GUMBO_INSERTION_IMPLIED = 1 << 3,
383
524
 
384
525
  /**
385
- * A flag for nodes that are converted from their end tag equivalents. For
386
- * example, </p> when no paragraph is open implies that the parser should
387
- * create a <p> tag and immediately close it, while </br> means the same thing
388
- * as <br>.
526
+ * A flag for nodes that are converted from their end tag equivalents.
527
+ * For example, `</p>` when no paragraph is open implies that the
528
+ * parser should create a `<p>` tag and immediately close it, while
529
+ * `</br>` means the same thing as `<br>`.
389
530
  */
390
531
  GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
391
532
 
392
- /** A flag for nodes that are converted from the parse of an <isindex> tag. */
393
- GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
533
+ // Value 1 << 5 was for a flag that has since been removed.
394
534
 
395
- /** A flag for <image> tags that are rewritten as <img>. */
535
+ /** A flag for `<image>` tags that are rewritten as `<img>`. */
396
536
  GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
397
537
 
398
538
  /**
399
- * A flag for nodes that are cloned as a result of the reconstruction of
400
- * active formatting elements. This is set only on the clone; the initial
401
- * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
539
+ * A flag for nodes that are cloned as a result of the reconstruction
540
+ * of active formatting elements. This is set only on the clone; the
541
+ * initial portion of the formatting run is a NORMAL node with an
542
+ * `IMPLICIT_END_TAG`.
402
543
  */
403
544
  GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
404
545
 
@@ -415,18 +556,19 @@ typedef enum {
415
556
  GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
416
557
  } GumboParseFlags;
417
558
 
418
- /**
419
- * Information specific to document nodes.
420
- */
559
+ /** Information specific to document nodes. */
421
560
  typedef struct {
422
561
  /**
423
- * An array of GumboNodes, containing the children of this element. This will
424
- * normally consist of the <html> element and any comment nodes found.
425
- * Pointers are owned.
562
+ * An array of `GumboNode`s, containing the children of this element.
563
+ * This will normally consist of the `<html>` element and any comment
564
+ * nodes found. Pointers are owned.
426
565
  */
427
566
  GumboVector /* GumboNode* */ children;
428
567
 
429
- // True if there was an explicit doctype token as opposed to it being omitted.
568
+ /**
569
+ * `true` if there was an explicit doctype token, as opposed to it
570
+ * being omitted.
571
+ */
430
572
  bool has_doctype;
431
573
 
432
574
  // Fields from the doctype token, copied verbatim.
@@ -435,65 +577,70 @@ typedef struct {
435
577
  const char* system_identifier;
436
578
 
437
579
  /**
438
- * Whether or not the document is in QuirksMode, as determined by the values
439
- * in the GumboTokenDocType template.
580
+ * Whether or not the document is in QuirksMode, as determined by the
581
+ * values in the GumboTokenDocType template.
440
582
  */
441
583
  GumboQuirksModeEnum doc_type_quirks_mode;
442
584
  } GumboDocument;
443
585
 
444
586
  /**
445
- * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
446
- * This contains just a block of text and its position.
587
+ * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
588
+ * elements. This contains just a block of text and its position.
447
589
  */
448
590
  typedef struct {
449
591
  /**
450
- * The text of this node, after entities have been parsed and decoded. For
451
- * comment/cdata nodes, this does not include the comment delimiters.
592
+ * The text of this node, after entities have been parsed and decoded.
593
+ * For comment and cdata nodes, this does not include the comment
594
+ * delimiters.
452
595
  */
453
596
  const char* text;
454
597
 
455
598
  /**
456
- * The original text of this node, as a pointer into the original buffer. For
457
- * comment/cdata nodes, this includes the comment delimiters.
599
+ * The original text of this node, as a pointer into the original
600
+ * buffer. For comment/cdata nodes, this includes the comment
601
+ * delimiters.
458
602
  */
459
603
  GumboStringPiece original_text;
460
604
 
461
605
  /**
462
- * The starting position of this node. This corresponds to the position of
463
- * original_text, before entities are decoded.
606
+ * The starting position of this node. This corresponds to the
607
+ * position of `original_text`, before entities are decoded.
464
608
  * */
465
609
  GumboSourcePosition start_pos;
466
610
  } GumboText;
467
611
 
468
612
  /**
469
- * The struct used to represent all HTML elements. This contains information
470
- * about the tag, attributes, and child nodes.
613
+ * The struct used to represent all HTML elements. This contains
614
+ * information about the tag, attributes, and child nodes.
471
615
  */
472
616
  typedef struct {
473
617
  /**
474
- * An array of GumboNodes, containing the children of this element. Pointers
475
- * are owned.
618
+ * An array of `GumboNode`s, containing the children of this element.
619
+ * Pointers are owned.
476
620
  */
477
621
  GumboVector /* GumboNode* */ children;
478
622
 
479
623
  /** The GumboTag enum for this element. */
480
624
  GumboTag tag;
481
625
 
626
+ /** The name for this element. */
627
+ const char* name;
628
+
482
629
  /** The GumboNamespaceEnum for this element. */
483
630
  GumboNamespaceEnum tag_namespace;
484
631
 
485
632
  /**
486
- * A GumboStringPiece pointing to the original tag text for this element,
487
- * pointing directly into the source buffer. If the tag was inserted
488
- * algorithmically (for example, <head> or <tbody> insertion), this will be a
489
- * zero-length string.
633
+ * A `GumboStringPiece` pointing to the original tag text for this
634
+ * element, pointing directly into the source buffer. If the tag was
635
+ * inserted algorithmically (for example, `<head>` or `<tbody>`
636
+ * insertion), this will be a zero-length string.
490
637
  */
491
638
  GumboStringPiece original_tag;
492
639
 
493
640
  /**
494
- * A GumboStringPiece pointing to the original end tag text for this element.
495
- * If the end tag was inserted algorithmically, (for example, closing a
496
- * self-closing tag), this will be a zero-length string.
641
+ * A `GumboStringPiece` pointing to the original end tag text for this
642
+ * element. If the end tag was inserted algorithmically, (for example,
643
+ * closing a self-closing tag), this will be a zero-length string.
497
644
  */
498
645
  GumboStringPiece original_end_tag;
499
646
 
@@ -504,30 +651,31 @@ typedef struct {
504
651
  GumboSourcePosition end_pos;
505
652
 
506
653
  /**
507
- * An array of GumboAttributes, containing the attributes for this tag in the
508
- * order that they were parsed. Pointers are owned.
654
+ * An array of `GumboAttribute`s, containing the attributes for this
655
+ * tag in the order that they were parsed. Pointers are owned.
509
656
  */
510
657
  GumboVector /* GumboAttribute* */ attributes;
511
658
  } GumboElement;
512
659
 
513
660
  /**
514
- * A supertype for GumboElement and GumboText, so that we can include one
515
- * generic type in lists of children and cast as necessary to subtypes.
661
+ * A supertype for `GumboElement` and `GumboText`, so that we can
662
+ * include one generic type in lists of children and cast as necessary
663
+ * to subtypes.
516
664
  */
517
665
  struct GumboInternalNode {
518
666
  /** The type of node that this is. */
519
667
  GumboNodeType type;
520
668
 
521
- /** Pointer back to parent node. Not owned. */
669
+ /** Pointer back to parent node. Not owned. */
522
670
  GumboNode* parent;
523
671
 
524
672
  /** The index within the parent's children vector of this node. */
525
- size_t index_within_parent;
673
+ unsigned int index_within_parent;
526
674
 
527
675
  /**
528
- * A bitvector of flags containing information about why this element was
529
- * inserted into the parse tree, including a variety of special parse
530
- * situations.
676
+ * A bitvector of flags containing information about why this element
677
+ * was inserted into the parse tree, including a variety of special
678
+ * parse situations.
531
679
  */
532
680
  GumboParseFlags parse_flags;
533
681
 
@@ -539,133 +687,257 @@ struct GumboInternalNode {
539
687
  } v;
540
688
  };
541
689
 
542
- /**
543
- * The type for an allocator function. Takes the 'userdata' member of the
544
- * GumboParser struct as its first argument. Semantics should be the same as
545
- * malloc, i.e. return a block of size_t bytes on success or NULL on failure.
546
- * Allocating a block of 0 bytes behaves as per malloc.
547
- */
548
- // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
549
- typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
550
-
551
- /**
552
- * The type for a deallocator function. Takes the 'userdata' member of the
553
- * GumboParser struct as its first argument.
554
- */
555
- typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
556
-
557
690
  /**
558
691
  * Input struct containing configuration options for the parser.
559
- * These let you specify alternate memory managers, provide different error
560
- * handling, etc.
561
- * Use kGumboDefaultOptions for sensible defaults, and only set what you need.
692
+ * These let you set resource limits, control error handling,
693
+ * configure fragment parsing, etc. Use `kGumboDefaultOptions` for sensible
694
+ * defaults and only set what you need.
562
695
  */
563
696
  typedef struct GumboInternalOptions {
564
- /** A memory allocator function. Default: malloc. */
565
- GumboAllocatorFunction allocator;
566
-
567
- /** A memory deallocator function. Default: free. */
568
- GumboDeallocatorFunction deallocator;
697
+ /**
698
+ * The tab-stop size, for computing positions in HTML files that
699
+ * use tabs. Default: `8`.
700
+ */
701
+ int tab_stop;
569
702
 
570
703
  /**
571
- * An opaque object that's passed in as the first argument to all callbacks
572
- * used by this library. Default: NULL.
704
+ * Whether or not to stop parsing when the first error is encountered.
705
+ * Default: `false`.
573
706
  */
574
- void* userdata;
707
+ bool stop_on_first_error;
575
708
 
576
709
  /**
577
- * The tab-stop size, for computing positions in source code that uses tabs.
578
- * Default: 8.
710
+ * Maximum allowed number of attributes per element. If this limit is
711
+ * exceeded, the parser will return early with a partial document and
712
+ * the returned `GumboOutput` will have its `status` field set to
713
+ * `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
714
+ * Default: `400`.
579
715
  */
580
- int tab_stop;
716
+ int max_attributes;
581
717
 
582
718
  /**
583
- * Whether or not to stop parsing when the first error is encountered.
584
- * Default: false.
719
+ * Maximum allowed depth for the parse tree. If this limit is exceeded,
720
+ * the parser will return early with a partial document and the returned
721
+ * `GumboOutput` will have its `status` field set to
722
+ * `GUMBO_STATUS_TREE_TOO_DEEP`.
723
+ * Default: `400`.
585
724
  */
586
- bool stop_on_first_error;
725
+ unsigned int max_tree_depth;
587
726
 
588
727
  /**
589
- * The maximum number of errors before the parser stops recording them. This
590
- * is provided so that if the page is totally borked, we don't completely fill
591
- * up the errors vector and exhaust memory with useless redundant errors. Set
592
- * to -1 to disable the limit.
593
- * Default: -1
728
+ * The maximum number of errors before the parser stops recording
729
+ * them. This is provided so that if the page is totally borked, we
730
+ * don't completely fill up the errors vector and exhaust memory with
731
+ * useless redundant errors. Set to `-1` to disable the limit.
732
+ * Default: `-1`.
594
733
  */
595
734
  int max_errors;
596
735
 
597
736
  /**
598
737
  * The fragment context for parsing:
599
- * https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
738
+ * https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
600
739
  *
601
- * If GUMBO_TAG_LAST is passed here, it is assumed to be "no fragment", i.e.
602
- * the regular parsing algorithm. Otherwise, pass the tag enum for the
603
- * intended parent of the parsed fragment. We use just the tag enum rather
604
- * than a full node because that's enough to set all the parsing context we
605
- * need, and it provides some additional flexibility for client code to act as
606
- * if parsing a fragment even when a full HTML tree isn't available.
740
+ * If `NULL` is passed here, it is assumed to be "no
741
+ * fragment", i.e. the regular parsing algorithm. Otherwise, pass the
742
+ * tag name for the intended parent of the parsed fragment. We use the
743
+ * tag name, namespace, and encoding attribute which are sufficient to
744
+ * set all of the parsing context needed for fragment parsing.
607
745
  *
608
- * Default: GUMBO_TAG_LAST
746
+ * Default: `NULL`.
609
747
  */
610
- GumboTag fragment_context;
748
+ const char* fragment_context;
611
749
 
612
750
  /**
613
- * The namespace for the fragment context. This lets client code
614
- * differentiate between, say, parsing a <title> tag in SVG vs. parsing it in
615
- * HTML.
616
- * Default: GUMBO_NAMESPACE_HTML
751
+ * The namespace for the fragment context. This lets client code
752
+ * differentiate between, say, parsing a `<title>` tag in SVG vs.
753
+ * parsing it in HTML.
754
+ *
755
+ * Default: `GUMBO_NAMESPACE_HTML`.
617
756
  */
618
757
  GumboNamespaceEnum fragment_namespace;
758
+
759
+ /**
760
+ * The value of the fragment context's `encoding` attribute, if any.
761
+ * Set to `NULL` for no `encoding` attribute.
762
+ *
763
+ * Default: `NULL`.
764
+ */
765
+ const char* fragment_encoding;
766
+
767
+ /**
768
+ * Quirks mode for fragment parsing. The quirks mode for a given DOCTYPE can
769
+ * be looked up using `gumbo_compute_quirks_mode()`.
770
+ *
771
+ * Default: `GUMBO_DOCTYPE_NO_QUIRKS`.
772
+ */
773
+ GumboQuirksModeEnum quirks_mode;
774
+
775
+ /**
776
+ * For fragment parsing. Set this to true if the context node has a form
777
+ * element as an ancestor.
778
+ *
779
+ * Default: `false`.
780
+ */
781
+ bool fragment_context_has_form_ancestor;
619
782
  } GumboOptions;
620
783
 
621
784
  /** Default options struct; use this with gumbo_parse_with_options. */
622
785
  extern const GumboOptions kGumboDefaultOptions;
623
786
 
787
+ /**
788
+ * Status code indicating whether parsing finished successfully or
789
+ * was stopped mid-document due to exceptional circumstances.
790
+ */
791
+ typedef enum {
792
+ /**
793
+ * Indicates that parsing completed successfully. The resulting tree
794
+ * will be a complete document.
795
+ */
796
+ GUMBO_STATUS_OK,
797
+
798
+ /**
799
+ * Indicates that the maximum element nesting limit
800
+ * (`GumboOptions::max_tree_depth`) was reached during parsing. The
801
+ * resulting tree will be a partial document, with no further nodes
802
+ * created after the point where the limit was reached. The partial
803
+ * document may be useful for constructing an error message but
804
+ * typically shouldn't be used for other purposes.
805
+ */
806
+ GUMBO_STATUS_TREE_TOO_DEEP,
807
+
808
+ /**
809
+ * Indicates that the maximum number of attributes per element
810
+ * (`GumboOptions::max_attributes`) was reached during parsing. The
811
+ * resulting tree will be a partial document, with no further nodes
812
+ * created after the point where the limit was reached. The partial
813
+ * document may be useful for constructing an error message but
814
+ * typically shouldn't be used for other purposes.
815
+ */
816
+ GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
817
+
818
+ // Currently unused
819
+ GUMBO_STATUS_OUT_OF_MEMORY,
820
+ } GumboOutputStatus;
821
+
822
+
624
823
  /** The output struct containing the results of the parse. */
625
824
  typedef struct GumboInternalOutput {
626
825
  /**
627
- * Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT
628
- * that contains the entire document as its child.
826
+ * Pointer to the document node. This is a `GumboNode` of type
827
+ * `NODE_DOCUMENT` that contains the entire document as its child.
629
828
  */
630
829
  GumboNode* document;
631
830
 
632
831
  /**
633
- * Pointer to the root node. This the <html> tag that forms the root of the
634
- * document.
832
+ * Pointer to the root node. This is the `<html>` tag that forms the
833
+ * root of the document.
635
834
  */
636
835
  GumboNode* root;
637
836
 
638
837
  /**
639
838
  * A list of errors that occurred during the parse.
640
- * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
641
- * fleshed out and may change in the future. For this reason, the GumboError
642
- * header isn't part of the public API. Contact us if you need errors
643
- * reported so we can work out something appropriate for your use-case.
644
839
  */
645
840
  GumboVector /* GumboError */ errors;
841
+
842
+ /**
843
+ * True if the parser encountered an error.
844
+ *
845
+ * This can be true and `errors` an empty `GumboVector` if the `max_errors`
846
+ * option was set to 0.
847
+ */
848
+ bool document_error;
849
+
850
+ /**
851
+ * A status code indicating whether parsing finished successfully or was
852
+ * stopped mid-document due to exceptional circumstances.
853
+ */
854
+ GumboOutputStatus status;
646
855
  } GumboOutput;
647
856
 
648
857
  /**
649
- * Parses a buffer of UTF8 text into an GumboNode parse tree. The buffer must
650
- * live at least as long as the parse tree, as some fields (eg. original_text)
651
- * point directly into the original buffer.
858
+ * Parses a buffer of UTF-8 text into a `GumboNode` parse tree. The
859
+ * buffer must live at least as long as the parse tree, as some fields
860
+ * (eg. `original_text`) point directly into the original buffer.
652
861
  *
653
862
  * This doesn't support buffers longer than 4 gigabytes.
654
863
  */
655
864
  GumboOutput* gumbo_parse(const char* buffer);
656
865
 
657
866
  /**
658
- * Extended version of gumbo_parse that takes an explicit options structure,
659
- * buffer, and length.
867
+ * Extended version of `gumbo_parse` that takes an explicit options
868
+ * structure, buffer, and length.
869
+ */
870
+ GumboOutput* gumbo_parse_with_options (
871
+ const GumboOptions* options,
872
+ const char* buffer,
873
+ size_t buffer_length
874
+ );
875
+
876
+ /**
877
+ * Compute the quirks mode based on the name, public identifier, and system
878
+ * identifier. Any of these may be `NULL` to indicate a missing value.
879
+ */
880
+ GumboQuirksModeEnum gumbo_compute_quirks_mode (
881
+ const char *name,
882
+ const char *pubid,
883
+ const char *sysid
884
+ );
885
+
886
+ /** Convert a `GumboOutputStatus` code into a readable description. */
887
+ const char* gumbo_status_to_string(GumboOutputStatus status);
888
+
889
+ /** Release the memory used for the parse tree and parse errors. */
890
+ void gumbo_destroy_output(GumboOutput* output);
891
+
892
+ /** Opaque GumboError type */
893
+ typedef struct GumboInternalError GumboError;
894
+
895
+ /**
896
+ * Returns the position of the error.
660
897
  */
661
- GumboOutput* gumbo_parse_with_options(
662
- const GumboOptions* options, const char* buffer, size_t buffer_length);
898
+ GumboSourcePosition gumbo_error_position(const GumboError* error);
663
899
 
664
- /** Release the memory used for the parse tree & parse errors. */
665
- void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output);
900
+ /**
901
+ * Returns a constant string representation of the error's code. This is owned
902
+ * by the library and should not be freed by the caller.
903
+ */
904
+ const char* gumbo_error_code(const GumboError* error);
905
+
906
+ /**
907
+ * Prints an error to a string. This stores a freshly-allocated buffer
908
+ * containing the error message text in output. The caller is responsible for
909
+ * freeing the buffer. The size of the error message is returned. The error
910
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
911
+ * returned size must be used.
912
+ */
913
+ size_t gumbo_error_to_string(const GumboError* error, char **output);
914
+
915
+ /**
916
+ * Prints a caret diagnostic to a string. This stores a freshly-allocated
917
+ * buffer containing the error message text in output. The caller is responsible for
918
+ * freeing the buffer. The size of the error message is returned. The error
919
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
920
+ * returned size must be used.
921
+ */
922
+ size_t gumbo_caret_diagnostic_to_string (
923
+ const GumboError* error,
924
+ const char* source_text,
925
+ size_t source_length,
926
+ char** output
927
+ );
928
+
929
+ /**
930
+ * Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
931
+ * instead of writing to a string.
932
+ */
933
+ void gumbo_print_caret_diagnostic (
934
+ const GumboError* error,
935
+ const char* source_text,
936
+ size_t source_length
937
+ );
666
938
 
667
939
  #ifdef __cplusplus
668
940
  }
669
941
  #endif
670
942
 
671
- #endif // GUMBO_GUMBO_H_
943
+ #endif // GUMBO_H