nokogumbo 1.5.0 → 2.0.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +237 -26
  3. data/ext/nokogumbo/extconf.rb +144 -0
  4. data/ext/nokogumbo/nokogumbo.c +793 -0
  5. data/gumbo-parser/src/ascii.c +75 -0
  6. data/gumbo-parser/src/ascii.h +115 -0
  7. data/gumbo-parser/src/attribute.c +26 -28
  8. data/gumbo-parser/src/attribute.h +3 -23
  9. data/gumbo-parser/src/char_ref.c +5972 -6816
  10. data/gumbo-parser/src/char_ref.h +14 -45
  11. data/gumbo-parser/src/error.c +510 -163
  12. data/gumbo-parser/src/error.h +70 -147
  13. data/gumbo-parser/src/foreign_attrs.c +104 -0
  14. data/gumbo-parser/src/gumbo.h +577 -305
  15. data/gumbo-parser/src/insertion_mode.h +4 -28
  16. data/gumbo-parser/src/macros.h +91 -0
  17. data/gumbo-parser/src/parser.c +2922 -2228
  18. data/gumbo-parser/src/parser.h +6 -22
  19. data/gumbo-parser/src/replacement.h +33 -0
  20. data/gumbo-parser/src/string_buffer.c +43 -50
  21. data/gumbo-parser/src/string_buffer.h +24 -40
  22. data/gumbo-parser/src/string_piece.c +39 -39
  23. data/gumbo-parser/src/svg_attrs.c +174 -0
  24. data/gumbo-parser/src/svg_tags.c +137 -0
  25. data/gumbo-parser/src/tag.c +186 -59
  26. data/gumbo-parser/src/tag_lookup.c +382 -0
  27. data/gumbo-parser/src/tag_lookup.h +13 -0
  28. data/gumbo-parser/src/token_buffer.c +79 -0
  29. data/gumbo-parser/src/token_buffer.h +71 -0
  30. data/gumbo-parser/src/token_type.h +1 -25
  31. data/gumbo-parser/src/tokenizer.c +2128 -1562
  32. data/gumbo-parser/src/tokenizer.h +41 -52
  33. data/gumbo-parser/src/tokenizer_states.h +281 -45
  34. data/gumbo-parser/src/utf8.c +98 -123
  35. data/gumbo-parser/src/utf8.h +84 -52
  36. data/gumbo-parser/src/util.c +48 -38
  37. data/gumbo-parser/src/util.h +10 -40
  38. data/gumbo-parser/src/vector.c +45 -57
  39. data/gumbo-parser/src/vector.h +17 -39
  40. data/lib/nokogumbo.rb +18 -170
  41. data/lib/nokogumbo/html5.rb +252 -0
  42. data/lib/nokogumbo/html5/document.rb +53 -0
  43. data/lib/nokogumbo/html5/document_fragment.rb +62 -0
  44. data/lib/nokogumbo/html5/node.rb +72 -0
  45. data/lib/nokogumbo/version.rb +3 -0
  46. metadata +40 -21
  47. data/ext/nokogumboc/extconf.rb +0 -60
  48. data/ext/nokogumboc/nokogumbo.c +0 -295
  49. data/gumbo-parser/src/char_ref.rl +0 -2554
  50. data/gumbo-parser/src/string_piece.h +0 -38
  51. data/gumbo-parser/src/tag.in +0 -150
  52. data/gumbo-parser/src/tag_enum.h +0 -153
  53. data/gumbo-parser/src/tag_gperf.h +0 -105
  54. data/gumbo-parser/src/tag_sizes.h +0 -4
  55. data/gumbo-parser/src/tag_strings.h +0 -153
  56. data/gumbo-parser/visualc/include/strings.h +0 -4
  57. data/test-nokogumbo.rb +0 -190
@@ -1,32 +1,13 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // Error types, enums, and handling functions.
18
-
19
1
  #ifndef GUMBO_ERROR_H_
20
2
  #define GUMBO_ERROR_H_
21
- #ifdef _MSC_VER
22
- #define _CRT_SECURE_NO_WARNINGS
23
- #endif
3
+
24
4
  #include <stdint.h>
25
5
 
26
6
  #include "gumbo.h"
27
7
  #include "insertion_mode.h"
28
8
  #include "string_buffer.h"
29
9
  #include "token_type.h"
10
+ #include "tokenizer_states.h"
30
11
 
31
12
  #ifdef __cplusplus
32
13
  extern "C" {
@@ -35,84 +16,66 @@ extern "C" {
35
16
  struct GumboInternalParser;
36
17
 
37
18
  typedef enum {
19
+ // Defined errors.
20
+ // https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
21
+ GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT,
22
+ GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER,
23
+ GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER,
24
+ GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
25
+ GUMBO_ERR_CDATA_IN_HTML_CONTENT,
26
+ GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
27
+ GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM,
28
+ GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
29
+ GUMBO_ERR_END_TAG_WITH_ATTRIBUTES,
30
+ GUMBO_ERR_DUPLICATE_ATTRIBUTE,
31
+ GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS,
32
+ GUMBO_ERR_EOF_BEFORE_TAG_NAME,
33
+ GUMBO_ERR_EOF_IN_CDATA,
34
+ GUMBO_ERR_EOF_IN_COMMENT,
35
+ GUMBO_ERR_EOF_IN_DOCTYPE,
36
+ GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT,
37
+ GUMBO_ERR_EOF_IN_TAG,
38
+ GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT,
39
+ GUMBO_ERR_INCORRECTLY_OPENED_COMMENT,
40
+ GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME,
41
+ GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME,
42
+ GUMBO_ERR_MISSING_ATTRIBUTE_VALUE,
43
+ GUMBO_ERR_MISSING_DOCTYPE_NAME,
44
+ GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER,
45
+ GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER,
46
+ GUMBO_ERR_MISSING_END_TAG_NAME,
47
+ GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
48
+ GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
49
+ GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
50
+ GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
51
+ GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
52
+ GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME,
53
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
54
+ GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
55
+ GUMBO_ERR_NESTED_COMMENT,
56
+ GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
57
+ GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM,
58
+ GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS,
59
+ GUMBO_ERR_NULL_CHARACTER_REFERENCE,
60
+ GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
61
+ GUMBO_ERR_SURROGATE_IN_INPUT_STREAM,
62
+ GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
63
+ GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME,
64
+ GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE,
65
+ GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME,
66
+ GUMBO_ERR_UNEXPECTED_NULL_CHARACTER,
67
+ GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME,
68
+ GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG,
69
+ GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE,
70
+
71
+ // Encoding errors.
38
72
  GUMBO_ERR_UTF8_INVALID,
39
73
  GUMBO_ERR_UTF8_TRUNCATED,
40
- GUMBO_ERR_UTF8_NULL,
41
- GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
42
- GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
43
- GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
44
- GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
45
- GUMBO_ERR_NAMED_CHAR_REF_INVALID,
46
- GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
47
- GUMBO_ERR_TAG_EOF,
48
- GUMBO_ERR_TAG_INVALID,
49
- GUMBO_ERR_CLOSE_TAG_EMPTY,
50
- GUMBO_ERR_CLOSE_TAG_EOF,
51
- GUMBO_ERR_CLOSE_TAG_INVALID,
52
- GUMBO_ERR_SCRIPT_EOF,
53
- GUMBO_ERR_ATTR_NAME_EOF,
54
- GUMBO_ERR_ATTR_NAME_INVALID,
55
- GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
56
- GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
57
- GUMBO_ERR_ATTR_UNQUOTED_EOF,
58
- GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
59
- GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
60
- GUMBO_ERR_ATTR_AFTER_EOF,
61
- GUMBO_ERR_ATTR_AFTER_INVALID,
62
- GUMBO_ERR_DUPLICATE_ATTR,
63
- GUMBO_ERR_SOLIDUS_EOF,
64
- GUMBO_ERR_SOLIDUS_INVALID,
65
- GUMBO_ERR_DASHES_OR_DOCTYPE,
66
- GUMBO_ERR_COMMENT_EOF,
67
- GUMBO_ERR_COMMENT_INVALID,
68
- GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
69
- GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
70
- GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
71
- GUMBO_ERR_COMMENT_END_BANG_EOF,
72
- GUMBO_ERR_DOCTYPE_EOF,
73
- GUMBO_ERR_DOCTYPE_INVALID,
74
- GUMBO_ERR_DOCTYPE_SPACE,
75
- GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
76
- GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
77
- GUMBO_ERR_DOCTYPE_END,
74
+
75
+ // Generic parser error.
78
76
  GUMBO_ERR_PARSER,
79
- GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
80
77
  } GumboErrorType;
81
78
 
82
- // Additional data for duplicated attributes.
83
- typedef struct GumboInternalDuplicateAttrError {
84
- // The name of the attribute. Owned by this struct.
85
- const char* name;
86
-
87
- // The (0-based) index within the attributes vector of the original
88
- // occurrence.
89
- unsigned int original_index;
90
-
91
- // The (0-based) index where the new occurrence would be.
92
- unsigned int new_index;
93
- } GumboDuplicateAttrError;
94
-
95
- // A simplified representation of the tokenizer state, designed to be more
96
- // useful to clients of this library than the internal representation. This
97
- // condenses the actual states used in the tokenizer state machine into a few
98
- // values that will be familiar to users of HTML.
99
- typedef enum {
100
- GUMBO_ERR_TOKENIZER_DATA,
101
- GUMBO_ERR_TOKENIZER_CHAR_REF,
102
- GUMBO_ERR_TOKENIZER_RCDATA,
103
- GUMBO_ERR_TOKENIZER_RAWTEXT,
104
- GUMBO_ERR_TOKENIZER_PLAINTEXT,
105
- GUMBO_ERR_TOKENIZER_SCRIPT,
106
- GUMBO_ERR_TOKENIZER_TAG,
107
- GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
108
- GUMBO_ERR_TOKENIZER_ATTR_NAME,
109
- GUMBO_ERR_TOKENIZER_ATTR_VALUE,
110
- GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
111
- GUMBO_ERR_TOKENIZER_COMMENT,
112
- GUMBO_ERR_TOKENIZER_DOCTYPE,
113
- GUMBO_ERR_TOKENIZER_CDATA,
114
- } GumboTokenizerErrorState;
115
-
116
79
  // Additional data for tokenizer errors.
117
80
  // This records the current state and codepoint encountered - this is usually
118
81
  // enough to reconstruct what went wrong and provide a friendly error message.
@@ -121,7 +84,7 @@ typedef struct GumboInternalTokenizerError {
121
84
  int codepoint;
122
85
 
123
86
  // The state that the tokenizer was in at the time.
124
- GumboTokenizerErrorState state;
87
+ GumboTokenizerEnum state;
125
88
  } GumboTokenizerError;
126
89
 
127
90
  // Additional data for parse errors.
@@ -129,61 +92,43 @@ typedef struct GumboInternalParserError {
129
92
  // The type of input token that resulted in this error.
130
93
  GumboTokenType input_type;
131
94
 
132
- // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
95
+ // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
133
96
  GumboTag input_tag;
134
97
 
135
98
  // The insertion mode that the parser was in at the time.
136
99
  GumboInsertionMode parser_state;
137
100
 
138
- // The tag stack at the point of the error. Note that this is an GumboVector
101
+ // The tag stack at the point of the error. Note that this is an GumboVector
139
102
  // of GumboTag's *stored by value* - cast the void* to an GumboTag directly to
140
103
  // get at the tag.
141
104
  GumboVector /* GumboTag */ tag_stack;
142
105
  } GumboParserError;
143
106
 
144
107
  // The overall error struct representing an error in decoding/tokenizing/parsing
145
- // the HTML. This contains an enumerated type flag, a source position, and then
108
+ // the HTML. This contains an enumerated type flag, a source position, and then
146
109
  // a union of fields containing data specific to the error.
147
- typedef struct GumboInternalError {
110
+ struct GumboInternalError {
148
111
  // The type of error.
149
112
  GumboErrorType type;
150
113
 
151
114
  // The position within the source file where the error occurred.
152
115
  GumboSourcePosition position;
153
116
 
154
- // A pointer to the byte within the original source file text where the error
155
- // occurred (note that this is not the same as position.offset, as that gives
156
- // character-based instead of byte-based offsets).
157
- const char* original_text;
117
+ // The piece of text that caused the error.
118
+ GumboStringPiece original_text;
158
119
 
159
120
  // Type-specific error information.
160
121
  union {
161
- // The code point we encountered, for:
162
- // * GUMBO_ERR_UTF8_INVALID
163
- // * GUMBO_ERR_UTF8_TRUNCATED
164
- // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
165
- // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
166
- uint64_t codepoint;
167
-
168
122
  // Tokenizer errors.
169
123
  GumboTokenizerError tokenizer;
170
124
 
171
- // Short textual data, for:
172
- // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
173
- // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
174
- GumboStringPiece text;
175
-
176
- // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
177
- GumboDuplicateAttrError duplicate_attr;
178
-
179
- // Parser state, for GUMBO_ERR_PARSER and
180
- // GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
181
- struct GumboInternalParserError parser;
125
+ // Parser errors.
126
+ GumboParserError parser;
182
127
  } v;
183
- } GumboError;
128
+ };
184
129
 
185
130
  // Adds a new error to the parser's error list, and returns a pointer to it so
186
- // that clients can fill out the rest of its fields. May return NULL if we're
131
+ // that clients can fill out the rest of its fields. May return NULL if we're
187
132
  // already over the max_errors field specified in GumboOptions.
188
133
  GumboError* gumbo_add_error(struct GumboInternalParser* parser);
189
134
 
@@ -194,32 +139,10 @@ void gumbo_init_errors(struct GumboInternalParser* errors);
194
139
  void gumbo_destroy_errors(struct GumboInternalParser* errors);
195
140
 
196
141
  // Frees the memory used for a single GumboError.
197
- void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
198
-
199
- // Prints an error to a string. This fills an empty GumboStringBuffer with a
200
- // freshly-allocated buffer containing the error message text. The caller is
201
- // responsible for deleting the buffer. (Note that the buffer is allocated with
202
- // the allocator specified in the GumboParser config and hence should be freed
203
- // by gumbo_parser_deallocate().)
204
- void gumbo_error_to_string(struct GumboInternalParser* parser,
205
- const GumboError* error, GumboStringBuffer* output);
206
-
207
- // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
208
- // with a freshly-allocated buffer containing the error message text. The
209
- // caller is responsible for deleting the buffer. (Note that the buffer is
210
- // allocated with the allocator specified in the GumboParser config and hence
211
- // should be freed by gumbo_parser_deallocate().)
212
- void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser,
213
- const GumboError* error, const char* source_text,
214
- GumboStringBuffer* output);
215
-
216
- // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
217
- // of writing to a string.
218
- void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser,
219
- const GumboError* error, const char* source_text);
142
+ void gumbo_error_destroy(GumboError* error);
220
143
 
221
144
  #ifdef __cplusplus
222
145
  }
223
146
  #endif
224
147
 
225
- #endif // GUMBO_ERROR_H_
148
+ #endif // GUMBO_ERROR_H_
@@ -0,0 +1,104 @@
1
+ /* ANSI-C code produced by gperf version 3.1 */
2
+ /* Command-line: gperf -m100 -n lib/foreign_attrs.gperf */
3
+ /* Computed positions: -k'2,8' */
4
+ /* Filtered by: mk/gperf-filter.sed */
5
+
6
+ #include "replacement.h"
7
+ #include "macros.h"
8
+ #include <string.h>
9
+
10
+ #define TOTAL_KEYWORDS 11
11
+ #define MIN_WORD_LENGTH 5
12
+ #define MAX_WORD_LENGTH 13
13
+ #define MIN_HASH_VALUE 0
14
+ #define MAX_HASH_VALUE 10
15
+ /* maximum key range = 11, duplicates = 0 */
16
+
17
+ static inline unsigned int
18
+ hash (register const char *str, register size_t len)
19
+ {
20
+ static const unsigned char asso_values[] =
21
+ {
22
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
23
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
24
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
25
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
26
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
27
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
28
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
29
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
30
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
31
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 2,
32
+ 11, 10, 11, 9, 7, 6, 11, 11, 1, 0,
33
+ 11, 5, 11, 11, 4, 11, 11, 11, 11, 11,
34
+ 11, 3, 11, 11, 11, 11, 11, 11, 11, 11,
35
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
36
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
37
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
38
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
39
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
40
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
41
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
42
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
43
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
44
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
45
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
46
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
47
+ 11, 11, 11, 11, 11, 11
48
+ };
49
+ register unsigned int hval = 0;
50
+
51
+ switch (len)
52
+ {
53
+ default:
54
+ hval += asso_values[(unsigned char)str[7]];
55
+ /*FALLTHROUGH*/
56
+ case 7:
57
+ case 6:
58
+ case 5:
59
+ case 4:
60
+ case 3:
61
+ case 2:
62
+ hval += asso_values[(unsigned char)str[1]];
63
+ break;
64
+ }
65
+ return hval;
66
+ }
67
+
68
+ const ForeignAttrReplacement *
69
+ gumbo_get_foreign_attr_replacement (register const char *str, register size_t len)
70
+ {
71
+ static const unsigned char lengthtable[] =
72
+ {
73
+ 5, 11, 9, 13, 10, 10, 10, 11, 10, 8, 8
74
+ };
75
+ static const ForeignAttrReplacement wordlist[] =
76
+ {
77
+ {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
78
+ {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
79
+ {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
80
+ {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
81
+ {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
82
+ {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
83
+ {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
84
+ {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
85
+ {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
86
+ {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
87
+ {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML}
88
+ };
89
+
90
+ if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
91
+ {
92
+ register unsigned int key = hash (str, len);
93
+
94
+ if (key <= MAX_HASH_VALUE)
95
+ if (len == lengthtable[key])
96
+ {
97
+ register const char *s = wordlist[key].from;
98
+
99
+ if (s && *str == *s && !memcmp (str + 1, s + 1, len - 1))
100
+ return &wordlist[key];
101
+ }
102
+ }
103
+ return 0;
104
+ }
@@ -1,51 +1,33 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
18
- // GUMBO_ as a prefix for enum constants (static constants get the Google-style
19
- // kGumbo prefix).
1
+ // Copyright 2010 Google Inc.
2
+ // Copyright 2018 Craig Barnes.
3
+ // Licensed under the Apache License, version 2.0.
4
+
5
+ // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions,
6
+ // GUMBO_ as a prefix for enum constants and kGumbo as a prefix for
7
+ // static constants
20
8
 
21
9
  /**
22
10
  * @file
23
11
  * @mainpage Gumbo HTML Parser
24
12
  *
25
- * This provides a conformant, no-dependencies implementation of the HTML5
26
- * parsing algorithm. It supports only UTF8; if you need to parse a different
27
- * encoding, run a preprocessing step to convert to UTF8. It returns a parse
28
- * tree made of the structs in this file.
13
+ * This provides a conformant, no-dependencies implementation of the
14
+ * [HTML5] parsing algorithm. It supports only UTF-8 -- if you need
15
+ * to parse a different encoding, run a preprocessing step to convert
16
+ * to UTF-8. It returns a parse tree made of the structs in this file.
29
17
  *
30
18
  * Example:
31
19
  * @code
32
20
  * GumboOutput* output = gumbo_parse(input);
33
21
  * do_something_with_doctype(output->document);
34
22
  * do_something_with_html_tree(output->root);
35
- * gumbo_destroy_output(&options, output);
23
+ * gumbo_destroy_output(output);
36
24
  * @endcode
37
- * HTML5 Spec:
38
25
  *
39
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
26
+ * [HTML5]: https://html.spec.whatwg.org/multipage/
40
27
  */
41
28
 
42
- #ifndef GUMBO_GUMBO_H_
43
- #define GUMBO_GUMBO_H_
44
-
45
- #ifdef _MSC_VER
46
- #define _CRT_SECURE_NO_WARNINGS
47
- #define fileno _fileno
48
- #endif
29
+ #ifndef GUMBO_H
30
+ #define GUMBO_H
49
31
 
50
32
  #include <stdbool.h>
51
33
  #include <stddef.h>
@@ -55,73 +37,77 @@ extern "C" {
55
37
  #endif
56
38
 
57
39
  /**
58
- * A struct representing a character position within the original text buffer.
59
- * Line and column numbers are 1-based and offsets are 0-based, which matches
60
- * how most editors and command-line tools work. Also, columns measure
61
- * positions in terms of characters while offsets measure by bytes; this is
62
- * because the offset field is often used to pull out a particular region of
63
- * text (which in most languages that bind to C implies pointer arithmetic on a
64
- * buffer of bytes), while the column field is often used to reference a
65
- * particular column on a printable display, which nowadays is usually UTF-8.
40
+ * A struct representing a character position within the original text
41
+ * buffer. Line and column numbers are 1-based and offsets are 0-based,
42
+ * which matches how most editors and command-line tools work.
66
43
  */
67
44
  typedef struct {
68
- unsigned int line;
69
- unsigned int column;
70
- unsigned int offset;
45
+ size_t line;
46
+ size_t column;
47
+ size_t offset;
71
48
  } GumboSourcePosition;
72
49
 
73
50
  /**
74
- * A SourcePosition used for elements that have no source position, i.e.
75
- * parser-inserted elements.
76
- */
77
- extern const GumboSourcePosition kGumboEmptySourcePosition;
78
-
79
- /**
80
- * A struct representing a string or part of a string. Strings within the
81
- * parser are represented by a char* and a length; the char* points into
82
- * an existing data buffer owned by some other code (often the original input).
83
- * GumboStringPieces are assumed (by convention) to be immutable, because they
84
- * may share data. Use GumboStringBuffer if you need to construct a string.
85
- * Clients should assume that it is not NUL-terminated, and should always use
86
- * explicit lengths when manipulating them.
51
+ * A struct representing a string or part of a string. Strings within
52
+ * the parser are represented by a `char*` and a length; the `char*`
53
+ * points into an existing data buffer owned by some other code (often
54
+ * the original input). `GumboStringPiece`s are assumed (by convention)
55
+ * to be immutable, because they may share data. Clients should assume
56
+ * that it is not NUL-terminated and should always use explicit lengths
57
+ * when manipulating them.
87
58
  */
88
59
  typedef struct {
89
- /** A pointer to the beginning of the string. NULL iff length == 0. */
60
+ /** A pointer to the beginning of the string. `NULL` if `length == 0`. */
90
61
  const char* data;
91
62
 
92
- /** The length of the string fragment, in bytes. May be zero. */
63
+ /** The length of the string fragment, in bytes (may be zero). */
93
64
  size_t length;
94
65
  } GumboStringPiece;
95
66
 
67
+ #define GUMBO_EMPTY_STRING_INIT { .data = NULL, .length = 0 }
96
68
  /** A constant to represent a 0-length null string. */
97
- extern const GumboStringPiece kGumboEmptyString;
69
+ #define kGumboEmptyString (const GumboStringPiece)GUMBO_EMPTY_STRING_INIT
98
70
 
99
71
  /**
100
- * Compares two GumboStringPieces, and returns true if they're equal or false
101
- * otherwise.
72
+ * Compares two `GumboStringPiece`s, and returns `true` if they're
73
+ * equal or `false` otherwise.
102
74
  */
103
- bool gumbo_string_equals(
104
- const GumboStringPiece* str1, const GumboStringPiece* str2);
75
+ bool gumbo_string_equals (
76
+ const GumboStringPiece* str1,
77
+ const GumboStringPiece* str2
78
+ );
105
79
 
106
80
  /**
107
- * Compares two GumboStringPieces ignoring case, and returns true if they're
108
- * equal or false otherwise.
81
+ * Compares two `GumboStringPiece`s, ignoring case, and returns `true`
82
+ * if they're equal or `false` otherwise.
109
83
  */
110
- bool gumbo_string_equals_ignore_case(
111
- const GumboStringPiece* str1, const GumboStringPiece* str2);
84
+ bool gumbo_string_equals_ignore_case (
85
+ const GumboStringPiece* str1,
86
+ const GumboStringPiece* str2
87
+ );
112
88
 
113
89
  /**
114
- * A simple vector implementation. This stores a pointer to a data array and a
115
- * length. All elements are stored as void*; client code must cast to the
116
- * appropriate type. Overflows upon addition result in reallocation of the data
117
- * array, with the size doubling to maintain O(1) amortized cost. There is no
118
- * removal function, as this isn't needed for any of the operations within this
119
- * library. Iteration can be done through inspecting the structure directly in
120
- * a for-loop.
90
+ * Check if the first `GumboStringPiece` is a prefix of the second, ignoring
91
+ * case.
92
+ */
93
+ bool gumbo_string_prefix_ignore_case (
94
+ const GumboStringPiece* prefix,
95
+ const GumboStringPiece* str
96
+ );
97
+
98
+ /**
99
+ * A simple vector implementation. This stores a pointer to a data array
100
+ * and a length. All elements are stored as `void*`; client code must
101
+ * cast to the appropriate type. Overflows upon addition result in
102
+ * reallocation of the data array, with the size doubling to maintain
103
+ * `O(1)` amortized cost. There is no removal function, as this isn't
104
+ * needed for any of the operations within this library. Iteration can
105
+ * be done through inspecting the structure directly in a `for` loop.
121
106
  */
122
107
  typedef struct {
123
- /** Data elements. This points to a dynamically-allocated array of capacity
124
- * elements, each a void* to the element itself.
108
+ /**
109
+ * Data elements. This points to a dynamically-allocated array of
110
+ * `capacity` elements, each a `void*` to the element itself.
125
111
  */
126
112
  void** data;
127
113
 
@@ -132,82 +118,230 @@ typedef struct {
132
118
  unsigned int capacity;
133
119
  } GumboVector;
134
120
 
135
- /** An empty (0-length, 0-capacity) GumboVector. */
136
- extern const GumboVector kGumboEmptyVector;
121
+ # define GUMBO_EMPTY_VECTOR_INIT { .data = NULL, .length = 0, .capacity = 0 }
122
+ /** An empty (0-length, 0-capacity) `GumboVector`. */
123
+ #define kGumboEmptyVector (const GumboVector)GUMBO_EMPTY_VECTOR_INIT
137
124
 
138
125
  /**
139
- * Returns the first index at which an element appears in this vector (testing
140
- * by pointer equality), or -1 if it never does.
126
+ * Returns the first index at which an element appears in this vector
127
+ * (testing by pointer equality), or `-1` if it never does.
141
128
  */
142
129
  int gumbo_vector_index_of(GumboVector* vector, const void* element);
143
130
 
144
131
  /**
145
- * An enum for all the tags defined in the HTML5 standard. These correspond to
146
- * the tag names themselves. Enum constants exist only for tags which appear in
147
- * the spec itself (or for tags with special handling in the SVG and MathML
148
- * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
149
- * name can be obtained through original_tag.
132
+ * An `enum` for all the tags defined in the HTML5 standard. These
133
+ * correspond to the tag names themselves. Enum constants exist only
134
+ * for tags that appear in the spec itself (or for tags with special
135
+ * handling in the SVG and MathML namespaces). Any other tags appear
136
+ * as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained
137
+ * through `original_tag`.
150
138
  *
151
- * This is mostly for API convenience, so that clients of this library don't
152
- * need to perform a strcasecmp to find the normalized tag name. It also has
153
- * efficiency benefits, by letting the parser work with enums instead of
154
- * strings.
139
+ * This is mostly for API convenience, so that clients of this library
140
+ * don't need to perform a `strcasecmp` to find the normalized tag
141
+ * name. It also has efficiency benefits, by letting the parser work
142
+ * with enums instead of strings.
155
143
  */
156
144
  typedef enum {
157
- // Load all the tags from an external source, generated from tag.in.
158
- #include "tag_enum.h"
159
- // Used for all tags that don't have special handling in HTML. Add new tags
160
- // to the end of tag.in so as to preserve backwards-compatibility.
145
+ GUMBO_TAG_HTML,
146
+ GUMBO_TAG_HEAD,
147
+ GUMBO_TAG_TITLE,
148
+ GUMBO_TAG_BASE,
149
+ GUMBO_TAG_LINK,
150
+ GUMBO_TAG_META,
151
+ GUMBO_TAG_STYLE,
152
+ GUMBO_TAG_SCRIPT,
153
+ GUMBO_TAG_NOSCRIPT,
154
+ GUMBO_TAG_TEMPLATE,
155
+ GUMBO_TAG_BODY,
156
+ GUMBO_TAG_ARTICLE,
157
+ GUMBO_TAG_SECTION,
158
+ GUMBO_TAG_NAV,
159
+ GUMBO_TAG_ASIDE,
160
+ GUMBO_TAG_H1,
161
+ GUMBO_TAG_H2,
162
+ GUMBO_TAG_H3,
163
+ GUMBO_TAG_H4,
164
+ GUMBO_TAG_H5,
165
+ GUMBO_TAG_H6,
166
+ GUMBO_TAG_HGROUP,
167
+ GUMBO_TAG_HEADER,
168
+ GUMBO_TAG_FOOTER,
169
+ GUMBO_TAG_ADDRESS,
170
+ GUMBO_TAG_P,
171
+ GUMBO_TAG_HR,
172
+ GUMBO_TAG_PRE,
173
+ GUMBO_TAG_BLOCKQUOTE,
174
+ GUMBO_TAG_OL,
175
+ GUMBO_TAG_UL,
176
+ GUMBO_TAG_LI,
177
+ GUMBO_TAG_DL,
178
+ GUMBO_TAG_DT,
179
+ GUMBO_TAG_DD,
180
+ GUMBO_TAG_FIGURE,
181
+ GUMBO_TAG_FIGCAPTION,
182
+ GUMBO_TAG_MAIN,
183
+ GUMBO_TAG_DIV,
184
+ GUMBO_TAG_A,
185
+ GUMBO_TAG_EM,
186
+ GUMBO_TAG_STRONG,
187
+ GUMBO_TAG_SMALL,
188
+ GUMBO_TAG_S,
189
+ GUMBO_TAG_CITE,
190
+ GUMBO_TAG_Q,
191
+ GUMBO_TAG_DFN,
192
+ GUMBO_TAG_ABBR,
193
+ GUMBO_TAG_DATA,
194
+ GUMBO_TAG_TIME,
195
+ GUMBO_TAG_CODE,
196
+ GUMBO_TAG_VAR,
197
+ GUMBO_TAG_SAMP,
198
+ GUMBO_TAG_KBD,
199
+ GUMBO_TAG_SUB,
200
+ GUMBO_TAG_SUP,
201
+ GUMBO_TAG_I,
202
+ GUMBO_TAG_B,
203
+ GUMBO_TAG_U,
204
+ GUMBO_TAG_MARK,
205
+ GUMBO_TAG_RUBY,
206
+ GUMBO_TAG_RT,
207
+ GUMBO_TAG_RP,
208
+ GUMBO_TAG_BDI,
209
+ GUMBO_TAG_BDO,
210
+ GUMBO_TAG_SPAN,
211
+ GUMBO_TAG_BR,
212
+ GUMBO_TAG_WBR,
213
+ GUMBO_TAG_INS,
214
+ GUMBO_TAG_DEL,
215
+ GUMBO_TAG_IMAGE,
216
+ GUMBO_TAG_IMG,
217
+ GUMBO_TAG_IFRAME,
218
+ GUMBO_TAG_EMBED,
219
+ GUMBO_TAG_OBJECT,
220
+ GUMBO_TAG_PARAM,
221
+ GUMBO_TAG_VIDEO,
222
+ GUMBO_TAG_AUDIO,
223
+ GUMBO_TAG_SOURCE,
224
+ GUMBO_TAG_TRACK,
225
+ GUMBO_TAG_CANVAS,
226
+ GUMBO_TAG_MAP,
227
+ GUMBO_TAG_AREA,
228
+ GUMBO_TAG_MATH,
229
+ GUMBO_TAG_MI,
230
+ GUMBO_TAG_MO,
231
+ GUMBO_TAG_MN,
232
+ GUMBO_TAG_MS,
233
+ GUMBO_TAG_MTEXT,
234
+ GUMBO_TAG_MGLYPH,
235
+ GUMBO_TAG_MALIGNMARK,
236
+ GUMBO_TAG_ANNOTATION_XML,
237
+ GUMBO_TAG_SVG,
238
+ GUMBO_TAG_FOREIGNOBJECT,
239
+ GUMBO_TAG_DESC,
240
+ GUMBO_TAG_TABLE,
241
+ GUMBO_TAG_CAPTION,
242
+ GUMBO_TAG_COLGROUP,
243
+ GUMBO_TAG_COL,
244
+ GUMBO_TAG_TBODY,
245
+ GUMBO_TAG_THEAD,
246
+ GUMBO_TAG_TFOOT,
247
+ GUMBO_TAG_TR,
248
+ GUMBO_TAG_TD,
249
+ GUMBO_TAG_TH,
250
+ GUMBO_TAG_FORM,
251
+ GUMBO_TAG_FIELDSET,
252
+ GUMBO_TAG_LEGEND,
253
+ GUMBO_TAG_LABEL,
254
+ GUMBO_TAG_INPUT,
255
+ GUMBO_TAG_BUTTON,
256
+ GUMBO_TAG_SELECT,
257
+ GUMBO_TAG_DATALIST,
258
+ GUMBO_TAG_OPTGROUP,
259
+ GUMBO_TAG_OPTION,
260
+ GUMBO_TAG_TEXTAREA,
261
+ GUMBO_TAG_KEYGEN,
262
+ GUMBO_TAG_OUTPUT,
263
+ GUMBO_TAG_PROGRESS,
264
+ GUMBO_TAG_METER,
265
+ GUMBO_TAG_DETAILS,
266
+ GUMBO_TAG_SUMMARY,
267
+ GUMBO_TAG_MENU,
268
+ GUMBO_TAG_MENUITEM,
269
+ GUMBO_TAG_APPLET,
270
+ GUMBO_TAG_ACRONYM,
271
+ GUMBO_TAG_BGSOUND,
272
+ GUMBO_TAG_DIR,
273
+ GUMBO_TAG_FRAME,
274
+ GUMBO_TAG_FRAMESET,
275
+ GUMBO_TAG_NOFRAMES,
276
+ GUMBO_TAG_LISTING,
277
+ GUMBO_TAG_XMP,
278
+ GUMBO_TAG_NEXTID,
279
+ GUMBO_TAG_NOEMBED,
280
+ GUMBO_TAG_PLAINTEXT,
281
+ GUMBO_TAG_RB,
282
+ GUMBO_TAG_STRIKE,
283
+ GUMBO_TAG_BASEFONT,
284
+ GUMBO_TAG_BIG,
285
+ GUMBO_TAG_BLINK,
286
+ GUMBO_TAG_CENTER,
287
+ GUMBO_TAG_FONT,
288
+ GUMBO_TAG_MARQUEE,
289
+ GUMBO_TAG_MULTICOL,
290
+ GUMBO_TAG_NOBR,
291
+ GUMBO_TAG_SPACER,
292
+ GUMBO_TAG_TT,
293
+ GUMBO_TAG_RTC,
294
+ GUMBO_TAG_DIALOG,
295
+ // Used for all tags that don't have special handling in HTML.
161
296
  GUMBO_TAG_UNKNOWN,
162
297
  // A marker value to indicate the end of the enum, for iterating over it.
163
- // Also used as the terminator for varargs functions that take tags.
164
298
  GUMBO_TAG_LAST,
165
299
  } GumboTag;
166
300
 
167
301
  /**
168
- * Returns the normalized (usually all-lowercased, except for foreign content)
169
- * tag name for an GumboTag enum. Return value is static data owned by the
170
- * library.
302
+ * Returns the normalized (all lower case) tag name for a `GumboTag` enum. The
303
+ * return value is static data owned by the library.
171
304
  */
172
305
  const char* gumbo_normalized_tagname(GumboTag tag);
173
306
 
174
307
  /**
175
- * Extracts the tag name from the original_text field of an element or token by
176
- * stripping off </> characters and attributes and adjusting the passed-in
177
- * GumboStringPiece appropriately. The tag name is in the original case and
178
- * shares a buffer with the original text, to simplify memory management.
179
- * Behavior is undefined if a string-piece that doesn't represent an HTML tag
180
- * (<tagname> or </tagname>) is passed in. If the string piece is completely
181
- * empty (NULL data pointer), then this function will exit successfully as a
182
- * no-op.
308
+ * Extracts the tag name from the `original_text` field of an element
309
+ * or token by stripping off `</>` characters and attributes and
310
+ * adjusting the passed-in `GumboStringPiece` appropriately. The tag
311
+ * name is in the original case and shares a buffer with the original
312
+ * text, to simplify memory management. Behavior is undefined if a
313
+ * string piece that doesn't represent an HTML tag (`<tagname>` or
314
+ * `</tagname>`) is passed in. If the string piece is completely
315
+ * empty (`NULL` data pointer), then this function will exit
316
+ * successfully as a no-op.
183
317
  */
184
318
  void gumbo_tag_from_original_text(GumboStringPiece* text);
185
319
 
186
320
  /**
187
- * Fixes the case of SVG elements that are not all lowercase.
188
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
189
- * This is not done at parse time because there's no place to store a mutated
190
- * tag name. tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags
191
- * without special handling), while original_tag_name is a pointer into the
192
- * original buffer. Instead, we provide this helper function that clients can
193
- * use to rename SVG tags as appropriate.
194
- * Returns the case-normalized SVG tagname if a replacement is found, or NULL if
195
- * no normalization is called for. The return value is static data and owned by
196
- * the library.
321
+ * Fixes the case of SVG elements that are not all lowercase. This is
322
+ * not done at parse time because there's no place to store a mutated
323
+ * tag name. `tag` is an enum (which will be `GUMBO_TAG_UNKNOWN` for most
324
+ * SVG tags without special handling), while `original_tag` is a
325
+ * pointer into the original buffer. Instead, we provide this helper
326
+ * function that clients can use to rename SVG tags as appropriate.
327
+ * Returns the case-normalized SVG tagname if a replacement is found, or
328
+ * `NULL` if no normalization is called for. The return value is static
329
+ * data and owned by the library.
330
+ *
331
+ * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
197
332
  */
198
333
  const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
199
334
 
200
335
  /**
201
- * Converts a tag name string (which may be in upper or mixed case) to a tag
202
- * enum. The `tag` version expects `tagname` to be NULL-terminated
336
+ * Converts a tag name string (which may be in upper or mixed case) to a
337
+ * tag enum.
203
338
  */
204
- GumboTag gumbo_tag_enum(const char* tagname);
205
- GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
339
+ GumboTag gumbo_tagn_enum(const char* tagname, size_t length);
206
340
 
207
341
  /**
208
342
  * Attribute namespaces.
209
- * HTML includes special handling for XLink, XML, and XMLNS namespaces on
210
- * attributes. Everything else goes in the generic "NONE" namespace.
343
+ * HTML includes special handling for XLink, XML, and XMLNS namespaces
344
+ * on attributes. Everything else goes in the generic "NONE" namespace.
211
345
  */
212
346
  typedef enum {
213
347
  GUMBO_ATTR_NAMESPACE_NONE,
@@ -217,46 +351,47 @@ typedef enum {
217
351
  } GumboAttributeNamespaceEnum;
218
352
 
219
353
  /**
220
- * A struct representing a single attribute on an HTML tag. This is a
221
- * name-value pair, but also includes information about source locations and
222
- * original source text.
354
+ * A struct representing a single attribute on an HTML tag. This is a
355
+ * name-value pair, but also includes information about source locations
356
+ * and original source text.
223
357
  */
224
358
  typedef struct {
225
359
  /**
226
- * The namespace for the attribute. This will usually be
227
- * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
228
- * values, per:
229
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
360
+ * The namespace for the attribute. This will usually be
361
+ * `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes
362
+ * take special values, per:
363
+ * https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
230
364
  */
231
365
  GumboAttributeNamespaceEnum attr_namespace;
232
366
 
233
367
  /**
234
- * The name of the attribute. This is in a freshly-allocated buffer to deal
235
- * with case-normalization, and is null-terminated.
368
+ * The name of the attribute. This is in a freshly-allocated buffer to
369
+ * deal with case-normalization and is null-terminated.
236
370
  */
237
371
  const char* name;
238
372
 
239
373
  /**
240
- * The original text of the attribute name, as a pointer into the original
241
- * source buffer.
374
+ * The original text of the attribute name, as a pointer into the
375
+ * original source buffer.
242
376
  */
243
377
  GumboStringPiece original_name;
244
378
 
245
379
  /**
246
- * The value of the attribute. This is in a freshly-allocated buffer to deal
247
- * with unescaping, and is null-terminated. It does not include any quotes
248
- * that surround the attribute. If the attribute has no value (for example,
249
- * 'selected' on a checkbox), this will be an empty string.
380
+ * The value of the attribute. This is in a freshly-allocated buffer
381
+ * to deal with unescaping and is null-terminated. It does not include
382
+ * any quotes that surround the attribute. If the attribute has no
383
+ * value (for example, `selected` on a checkbox) this will be an empty
384
+ * string.
250
385
  */
251
386
  const char* value;
252
387
 
253
388
  /**
254
- * The original text of the value of the attribute. This points into the
255
- * original source buffer. It includes any quotes that surround the
256
- * attribute, and you can look at original_value.data[0] and
257
- * original_value.data[original_value.length - 1] to determine what the quote
258
- * characters were. If the attribute has no value, this will be a 0-length
259
- * string.
389
+ * The original text of the value of the attribute. This points into
390
+ * the original source buffer. It includes any quotes that surround
391
+ * the attribute and you can look at `original_value.data[0]` and
392
+ * `original_value.data[original_value.length - 1]` to determine what
393
+ * the quote characters were. If the attribute has no value this will
394
+ * be a 0-length string.
260
395
  */
261
396
  GumboStringPiece original_value;
262
397
 
@@ -264,9 +399,9 @@ typedef struct {
264
399
  GumboSourcePosition name_start;
265
400
 
266
401
  /**
267
- * The ending position of the attribute name. This is not always derivable
402
+ * The ending position of the attribute name. This is not always derivable
268
403
  * from the starting position of the value because of the possibility of
269
- * whitespace around the = sign.
404
+ * whitespace around the `=` sign.
270
405
  */
271
406
  GumboSourcePosition name_end;
272
407
 
@@ -278,34 +413,37 @@ typedef struct {
278
413
  } GumboAttribute;
279
414
 
280
415
  /**
281
- * Given a vector of GumboAttributes, look up the one with the specified name
282
- * and return it, or NULL if no such attribute exists. This uses a
283
- * case-insensitive match, as HTML is case-insensitive.
416
+ * Given a vector of `GumboAttribute`s, look up the one with the
417
+ * specified name and return it, or `NULL` if no such attribute exists.
418
+ * This uses a case-insensitive match, as HTML is case-insensitive.
284
419
  */
285
420
  GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
286
421
 
287
422
  /**
288
- * Enum denoting the type of node. This determines the type of the node.v
289
- * union.
423
+ * Enum denoting the type of node. This determines the type of the
424
+ * `node.v` union.
290
425
  */
291
426
  typedef enum {
292
- /** Document node. v will be a GumboDocument. */
427
+ /** Document node. `v` will be a `GumboDocument`. */
293
428
  GUMBO_NODE_DOCUMENT,
294
- /** Element node. v will be a GumboElement. */
429
+ /** Element node. `v` will be a `GumboElement`. */
295
430
  GUMBO_NODE_ELEMENT,
296
- /** Text node. v will be a GumboText. */
431
+ /** Text node. `v` will be a `GumboText`. */
297
432
  GUMBO_NODE_TEXT,
298
- /** CDATA node. v will be a GumboText. */
433
+ /** CDATA node. `v` will be a `GumboText`. */
299
434
  GUMBO_NODE_CDATA,
300
- /** Comment node. v will be a GumboText, excluding comment delimiters. */
435
+ /** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */
301
436
  GUMBO_NODE_COMMENT,
302
- /** Text node, where all contents is whitespace. v will be a GumboText. */
437
+ /** Text node, where all content is whitespace. `v` will be a `GumboText`. */
303
438
  GUMBO_NODE_WHITESPACE,
304
- /** Template node. This is separate from GUMBO_NODE_ELEMENT because many
305
- * client libraries will want to ignore the contents of template nodes, as
306
- * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing
307
- * here, while clients that want to include template contents should also
308
- * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */
439
+ /**
440
+ * Template node. This is separate from `GUMBO_NODE_ELEMENT` because
441
+ * many client libraries will want to ignore the contents of template
442
+ * nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will
443
+ * do the right thing here, while clients that want to include template
444
+ * contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a
445
+ * `GumboElement`.
446
+ */
309
447
  GUMBO_NODE_TEMPLATE
310
448
  } GumboNodeType;
311
449
 
@@ -315,9 +453,7 @@ typedef enum {
315
453
  */
316
454
  typedef struct GumboInternalNode GumboNode;
317
455
 
318
- /**
319
- * http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode
320
- */
456
+ /** https://dom.spec.whatwg.org/#concept-document-quirks */
321
457
  typedef enum {
322
458
  GUMBO_DOCTYPE_NO_QUIRKS,
323
459
  GUMBO_DOCTYPE_QUIRKS,
@@ -326,10 +462,11 @@ typedef enum {
326
462
 
327
463
  /**
328
464
  * Namespaces.
329
- * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather,
330
- * anything inside an <svg> tag is in the SVG namespace, anything inside the
331
- * <math> tag is in the MathML namespace, and anything else is inside the HTML
332
- * namespace. No other namespaces are supported, so this can be an enum only.
465
+ * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
466
+ * Rather, anything inside an `<svg>` tag is in the SVG namespace,
467
+ * anything inside the `<math>` tag is in the MathML namespace, and
468
+ * anything else is inside the HTML namespace. No other namespaces are
469
+ * supported, so this can be an `enum`.
333
470
  */
334
471
  typedef enum {
335
472
  GUMBO_NAMESPACE_HTML,
@@ -339,66 +476,70 @@ typedef enum {
339
476
 
340
477
  /**
341
478
  * Parse flags.
342
- * We track the reasons for parser insertion of nodes and store them in a
343
- * bitvector in the node itself. This lets client code optimize out nodes that
344
- * are implied by the HTML structure of the document, or flag constructs that
345
- * may not be allowed by a style guide, or track the prevalence of incorrect or
346
- * tricky HTML code.
479
+ * We track the reasons for parser insertion of nodes and store them in
480
+ * a bitvector in the node itself. This lets client code optimize out
481
+ * nodes that are implied by the HTML structure of the document, or flag
482
+ * constructs that may not be allowed by a style guide, or track the
483
+ * prevalence of incorrect or tricky HTML code.
347
484
  */
348
485
  typedef enum {
349
486
  /**
350
- * A normal node - both start and end tags appear in the source, nothing has
351
- * been reparented.
487
+ * A normal node -- both start and end tags appear in the source,
488
+ * nothing has been reparented.
352
489
  */
353
490
  GUMBO_INSERTION_NORMAL = 0,
354
491
 
355
492
  /**
356
- * A node inserted by the parser to fulfill some implicit insertion rule.
357
- * This is usually set in addition to some other flag giving a more specific
358
- * insertion reason; it's a generic catch-all term meaning "The start tag for
359
- * this node did not appear in the document source".
493
+ * A node inserted by the parser to fulfill some implicit insertion
494
+ * rule. This is usually set in addition to some other flag giving a
495
+ * more specific insertion reason; it's a generic catch-all term
496
+ * meaning "The start tag for this node did not appear in the document
497
+ * source".
360
498
  */
361
499
  GUMBO_INSERTION_BY_PARSER = 1 << 0,
362
500
 
363
501
  /**
364
- * A flag indicating that the end tag for this node did not appear in the
365
- * document source. Note that in some cases, you can still have
366
- * parser-inserted nodes with an explicit end tag: for example, "Text</html>"
367
- * has GUMBO_INSERTED_BY_PARSER set on the <html> node, but
368
- * GUMBO_INSERTED_END_TAG_IMPLICITLY is unset, as the </html> tag actually
369
- * exists. This flag will be set only if the end tag is completely missing;
370
- * in some cases, the end tag may be misplaced (eg. a </body> tag with text
371
- * afterwards), which will leave this flag unset and require clients to
372
- * inspect the parse errors for that case.
502
+ * A flag indicating that the end tag for this node did not appear in
503
+ * the document source. Note that in some cases, you can still have
504
+ * parser-inserted nodes with an explicit end tag. For example,
505
+ * `Text</html>` has `GUMBO_INSERTION_BY_PARSER` set on the `<html>`
506
+ * node, but `GUMBO_INSERTION_IMPLICIT_END_TAG` is unset, as the
507
+ * `</html>` tag actually exists.
508
+ *
509
+ * This flag will be set only if the end tag is completely missing.
510
+ * In some cases, the end tag may be misplaced (e.g. a `</body>` tag
511
+ * with text afterwards), which will leave this flag unset and require
512
+ * clients to inspect the parse errors for that case.
373
513
  */
374
514
  GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
375
515
 
376
516
  // Value 1 << 2 was for a flag that has since been removed.
377
517
 
378
518
  /**
379
- * A flag for nodes that are inserted because their presence is implied by
380
- * other tags, eg. <html>, <head>, <body>, <tbody>, etc.
519
+ * A flag for nodes that are inserted because their presence is
520
+ * implied by other tags, e.g. `<html>`, `<head>`, `<body>`,
521
+ * `<tbody>`, etc.
381
522
  */
382
523
  GUMBO_INSERTION_IMPLIED = 1 << 3,
383
524
 
384
525
  /**
385
- * A flag for nodes that are converted from their end tag equivalents. For
386
- * example, </p> when no paragraph is open implies that the parser should
387
- * create a <p> tag and immediately close it, while </br> means the same thing
388
- * as <br>.
526
+ * A flag for nodes that are converted from their end tag equivalents.
527
+ * For example, `</p>` when no paragraph is open implies that the
528
+ * parser should create a `<p>` tag and immediately close it, while
529
+ * `</br>` means the same thing as `<br>`.
389
530
  */
390
531
  GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
391
532
 
392
- /** A flag for nodes that are converted from the parse of an <isindex> tag. */
393
- GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
533
+ // Value 1 << 5 was for a flag that has since been removed.
394
534
 
395
- /** A flag for <image> tags that are rewritten as <img>. */
535
+ /** A flag for `<image>` tags that are rewritten as `<img>`. */
396
536
  GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
397
537
 
398
538
  /**
399
- * A flag for nodes that are cloned as a result of the reconstruction of
400
- * active formatting elements. This is set only on the clone; the initial
401
- * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
539
+ * A flag for nodes that are cloned as a result of the reconstruction
540
+ * of active formatting elements. This is set only on the clone; the
541
+ * initial portion of the formatting run is a NORMAL node with an
542
+ * `IMPLICIT_END_TAG`.
402
543
  */
403
544
  GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
404
545
 
@@ -415,18 +556,19 @@ typedef enum {
415
556
  GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
416
557
  } GumboParseFlags;
417
558
 
418
- /**
419
- * Information specific to document nodes.
420
- */
559
+ /** Information specific to document nodes. */
421
560
  typedef struct {
422
561
  /**
423
- * An array of GumboNodes, containing the children of this element. This will
424
- * normally consist of the <html> element and any comment nodes found.
425
- * Pointers are owned.
562
+ * An array of `GumboNode`s, containing the children of this element.
563
+ * This will normally consist of the `<html>` element and any comment
564
+ * nodes found. Pointers are owned.
426
565
  */
427
566
  GumboVector /* GumboNode* */ children;
428
567
 
429
- // True if there was an explicit doctype token as opposed to it being omitted.
568
+ /**
569
+ * `true` if there was an explicit doctype token, as opposed to it
570
+ * being omitted.
571
+ */
430
572
  bool has_doctype;
431
573
 
432
574
  // Fields from the doctype token, copied verbatim.
@@ -435,65 +577,70 @@ typedef struct {
435
577
  const char* system_identifier;
436
578
 
437
579
  /**
438
- * Whether or not the document is in QuirksMode, as determined by the values
439
- * in the GumboTokenDocType template.
580
+ * Whether or not the document is in QuirksMode, as determined by the
581
+ * values in the GumboTokenDocType template.
440
582
  */
441
583
  GumboQuirksModeEnum doc_type_quirks_mode;
442
584
  } GumboDocument;
443
585
 
444
586
  /**
445
- * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
446
- * This contains just a block of text and its position.
587
+ * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
588
+ * elements. This contains just a block of text and its position.
447
589
  */
448
590
  typedef struct {
449
591
  /**
450
- * The text of this node, after entities have been parsed and decoded. For
451
- * comment/cdata nodes, this does not include the comment delimiters.
592
+ * The text of this node, after entities have been parsed and decoded.
593
+ * For comment and cdata nodes, this does not include the comment
594
+ * delimiters.
452
595
  */
453
596
  const char* text;
454
597
 
455
598
  /**
456
- * The original text of this node, as a pointer into the original buffer. For
457
- * comment/cdata nodes, this includes the comment delimiters.
599
+ * The original text of this node, as a pointer into the original
600
+ * buffer. For comment/cdata nodes, this includes the comment
601
+ * delimiters.
458
602
  */
459
603
  GumboStringPiece original_text;
460
604
 
461
605
  /**
462
- * The starting position of this node. This corresponds to the position of
463
- * original_text, before entities are decoded.
606
+ * The starting position of this node. This corresponds to the
607
+ * position of `original_text`, before entities are decoded.
464
608
  * */
465
609
  GumboSourcePosition start_pos;
466
610
  } GumboText;
467
611
 
468
612
  /**
469
- * The struct used to represent all HTML elements. This contains information
470
- * about the tag, attributes, and child nodes.
613
+ * The struct used to represent all HTML elements. This contains
614
+ * information about the tag, attributes, and child nodes.
471
615
  */
472
616
  typedef struct {
473
617
  /**
474
- * An array of GumboNodes, containing the children of this element. Pointers
475
- * are owned.
618
+ * An array of `GumboNode`s, containing the children of this element.
619
+ * Pointers are owned.
476
620
  */
477
621
  GumboVector /* GumboNode* */ children;
478
622
 
479
623
  /** The GumboTag enum for this element. */
480
624
  GumboTag tag;
481
625
 
626
+ /** The name for this element. */
627
+ const char* name;
628
+
482
629
  /** The GumboNamespaceEnum for this element. */
483
630
  GumboNamespaceEnum tag_namespace;
484
631
 
485
632
  /**
486
- * A GumboStringPiece pointing to the original tag text for this element,
487
- * pointing directly into the source buffer. If the tag was inserted
488
- * algorithmically (for example, <head> or <tbody> insertion), this will be a
489
- * zero-length string.
633
+ * A `GumboStringPiece` pointing to the original tag text for this
634
+ * element, pointing directly into the source buffer. If the tag was
635
+ * inserted algorithmically (for example, `<head>` or `<tbody>`
636
+ * insertion), this will be a zero-length string.
490
637
  */
491
638
  GumboStringPiece original_tag;
492
639
 
493
640
  /**
494
- * A GumboStringPiece pointing to the original end tag text for this element.
495
- * If the end tag was inserted algorithmically, (for example, closing a
496
- * self-closing tag), this will be a zero-length string.
641
+ * A `GumboStringPiece` pointing to the original end tag text for this
642
+ * element. If the end tag was inserted algorithmically, (for example,
643
+ * closing a self-closing tag), this will be a zero-length string.
497
644
  */
498
645
  GumboStringPiece original_end_tag;
499
646
 
@@ -504,30 +651,31 @@ typedef struct {
504
651
  GumboSourcePosition end_pos;
505
652
 
506
653
  /**
507
- * An array of GumboAttributes, containing the attributes for this tag in the
508
- * order that they were parsed. Pointers are owned.
654
+ * An array of `GumboAttribute`s, containing the attributes for this
655
+ * tag in the order that they were parsed. Pointers are owned.
509
656
  */
510
657
  GumboVector /* GumboAttribute* */ attributes;
511
658
  } GumboElement;
512
659
 
513
660
  /**
514
- * A supertype for GumboElement and GumboText, so that we can include one
515
- * generic type in lists of children and cast as necessary to subtypes.
661
+ * A supertype for `GumboElement` and `GumboText`, so that we can
662
+ * include one generic type in lists of children and cast as necessary
663
+ * to subtypes.
516
664
  */
517
665
  struct GumboInternalNode {
518
666
  /** The type of node that this is. */
519
667
  GumboNodeType type;
520
668
 
521
- /** Pointer back to parent node. Not owned. */
669
+ /** Pointer back to parent node. Not owned. */
522
670
  GumboNode* parent;
523
671
 
524
672
  /** The index within the parent's children vector of this node. */
525
- size_t index_within_parent;
673
+ unsigned int index_within_parent;
526
674
 
527
675
  /**
528
- * A bitvector of flags containing information about why this element was
529
- * inserted into the parse tree, including a variety of special parse
530
- * situations.
676
+ * A bitvector of flags containing information about why this element
677
+ * was inserted into the parse tree, including a variety of special
678
+ * parse situations.
531
679
  */
532
680
  GumboParseFlags parse_flags;
533
681
 
@@ -539,133 +687,257 @@ struct GumboInternalNode {
539
687
  } v;
540
688
  };
541
689
 
542
- /**
543
- * The type for an allocator function. Takes the 'userdata' member of the
544
- * GumboParser struct as its first argument. Semantics should be the same as
545
- * malloc, i.e. return a block of size_t bytes on success or NULL on failure.
546
- * Allocating a block of 0 bytes behaves as per malloc.
547
- */
548
- // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
549
- typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
550
-
551
- /**
552
- * The type for a deallocator function. Takes the 'userdata' member of the
553
- * GumboParser struct as its first argument.
554
- */
555
- typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
556
-
557
690
  /**
558
691
  * Input struct containing configuration options for the parser.
559
- * These let you specify alternate memory managers, provide different error
560
- * handling, etc.
561
- * Use kGumboDefaultOptions for sensible defaults, and only set what you need.
692
+ * These let you set resource limits, control error handling, configure
693
+ * fragment parsing, etc. Use `kGumboDefaultOptions` for sensible
694
+ * defaults and only set what you need.
562
695
  */
563
696
  typedef struct GumboInternalOptions {
564
- /** A memory allocator function. Default: malloc. */
565
- GumboAllocatorFunction allocator;
566
-
567
- /** A memory deallocator function. Default: free. */
568
- GumboDeallocatorFunction deallocator;
697
+ /**
698
+ * The tab-stop size, for computing positions in HTML files that
699
+ * use tabs. Default: `8`.
700
+ */
701
+ int tab_stop;
569
702
 
570
703
  /**
571
- * An opaque object that's passed in as the first argument to all callbacks
572
- * used by this library. Default: NULL.
704
+ * Whether or not to stop parsing when the first error is encountered.
705
+ * Default: `false`.
573
706
  */
574
- void* userdata;
707
+ bool stop_on_first_error;
575
708
 
576
709
  /**
577
- * The tab-stop size, for computing positions in source code that uses tabs.
578
- * Default: 8.
710
+ * Maximum allowed number of attributes per element. If this limit is
711
+ * exceeded, the parser will return early with a partial document and
712
+ * the returned `GumboOutput` will have its `status` field set to
713
+ * `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
714
+ * Default: `400`.
579
715
  */
580
- int tab_stop;
716
+ int max_attributes;
581
717
 
582
718
  /**
583
- * Whether or not to stop parsing when the first error is encountered.
584
- * Default: false.
719
+ * Maximum allowed depth for the parse tree. If this limit is exceeded,
720
+ * the parser will return early with a partial document and the returned
721
+ * `GumboOutput` will have its `status` field set to
722
+ * `GUMBO_STATUS_TREE_TOO_DEEP`.
723
+ * Default: `400`.
585
724
  */
586
- bool stop_on_first_error;
725
+ unsigned int max_tree_depth;
587
726
 
588
727
  /**
589
- * The maximum number of errors before the parser stops recording them. This
590
- * is provided so that if the page is totally borked, we don't completely fill
591
- * up the errors vector and exhaust memory with useless redundant errors. Set
592
- * to -1 to disable the limit.
593
- * Default: -1
728
+ * The maximum number of errors before the parser stops recording
729
+ * them. This is provided so that if the page is totally borked, we
730
+ * don't completely fill up the errors vector and exhaust memory with
731
+ * useless redundant errors. Set to `-1` to disable the limit.
732
+ * Default: `-1`.
594
733
  */
595
734
  int max_errors;
596
735
 
597
736
  /**
598
737
  * The fragment context for parsing:
599
- * https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
738
+ * https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
600
739
  *
601
- * If GUMBO_TAG_LAST is passed here, it is assumed to be "no fragment", i.e.
602
- * the regular parsing algorithm. Otherwise, pass the tag enum for the
603
- * intended parent of the parsed fragment. We use just the tag enum rather
604
- * than a full node because that's enough to set all the parsing context we
605
- * need, and it provides some additional flexibility for client code to act as
606
- * if parsing a fragment even when a full HTML tree isn't available.
740
+ * If `NULL` is passed here, it is assumed to be "no
741
+ * fragment", i.e. the regular parsing algorithm. Otherwise, pass the
742
+ * tag name for the intended parent of the parsed fragment. We use the
743
+ * tag name, namespace, and encoding attribute which are sufficient to
744
+ * set all of the parsing context needed for fragment parsing.
607
745
  *
608
- * Default: GUMBO_TAG_LAST
746
+ * Default: `NULL`.
609
747
  */
610
- GumboTag fragment_context;
748
+ const char* fragment_context;
611
749
 
612
750
  /**
613
- * The namespace for the fragment context. This lets client code
614
- * differentiate between, say, parsing a <title> tag in SVG vs. parsing it in
615
- * HTML.
616
- * Default: GUMBO_NAMESPACE_HTML
751
+ * The namespace for the fragment context. This lets client code
752
+ * differentiate between, say, parsing a `<title>` tag in SVG vs.
753
+ * parsing it in HTML.
754
+ *
755
+ * Default: `GUMBO_NAMESPACE_HTML`.
617
756
  */
618
757
  GumboNamespaceEnum fragment_namespace;
758
+
759
+ /**
760
+ * The value of the fragment context's `encoding` attribute, if any.
761
+ * Set to `NULL` for no `encoding` attribute.
762
+ *
763
+ * Default: `NULL`.
764
+ */
765
+ const char* fragment_encoding;
766
+
767
+ /**
768
+ * Quirks mode for fragment parsing. The quirks mode for a given DOCTYPE can
769
+ * be looked up using `gumbo_compute_quirks_mode()`.
770
+ *
771
+ * Default: `GUMBO_DOCTYPE_NO_QUIRKS`.
772
+ */
773
+ GumboQuirksModeEnum quirks_mode;
774
+
775
+ /**
776
+ * For fragment parsing. Set this to true if the context node has a form
777
+ * element as an ancestor.
778
+ *
779
+ * Default: `false`.
780
+ */
781
+ bool fragment_context_has_form_ancestor;
619
782
  } GumboOptions;
620
783
 
621
784
  /** Default options struct; use this with gumbo_parse_with_options. */
622
785
  extern const GumboOptions kGumboDefaultOptions;
623
786
 
787
+ /**
788
+ * Status code indicating whether parsing finished successfully or
789
+ * was stopped mid-document due to exceptional circumstances.
790
+ */
791
+ typedef enum {
792
+ /**
793
+ * Indicates that parsing completed successfully. The resulting tree
794
+ * will be a complete document.
795
+ */
796
+ GUMBO_STATUS_OK,
797
+
798
+ /**
799
+ * Indicates that the maximum element nesting limit
800
+ * (`GumboOptions::max_tree_depth`) was reached during parsing. The
801
+ * resulting tree will be a partial document, with no further nodes
802
+ * created after the point where the limit was reached. The partial
803
+ * document may be useful for constructing an error message but
804
+ * typically shouldn't be used for other purposes.
805
+ */
806
+ GUMBO_STATUS_TREE_TOO_DEEP,
807
+
808
+ /**
809
+ * Indicates that the maximum number of attributes per element
810
+ * (`GumboOptions::max_attributes`) was reached during parsing. The
811
+ * resulting tree will be a partial document, with no further nodes
812
+ * created after the point where the limit was reached. The partial
813
+ * document may be useful for constructing an error message but
814
+ * typically shouldn't be used for other purposes.
815
+ */
816
+ GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
817
+
818
+ // Currently unused
819
+ GUMBO_STATUS_OUT_OF_MEMORY,
820
+ } GumboOutputStatus;
821
+
822
+
624
823
  /** The output struct containing the results of the parse. */
625
824
  typedef struct GumboInternalOutput {
626
825
  /**
627
- * Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT
628
- * that contains the entire document as its child.
826
+ * Pointer to the document node. This is a `GumboNode` of type
827
+ * `NODE_DOCUMENT` that contains the entire document as its child.
629
828
  */
630
829
  GumboNode* document;
631
830
 
632
831
  /**
633
- * Pointer to the root node. This the <html> tag that forms the root of the
634
- * document.
832
+ * Pointer to the root node. This is the `<html>` tag that forms the
833
+ * root of the document.
635
834
  */
636
835
  GumboNode* root;
637
836
 
638
837
  /**
639
838
  * A list of errors that occurred during the parse.
640
- * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
641
- * fleshed out and may change in the future. For this reason, the GumboError
642
- * header isn't part of the public API. Contact us if you need errors
643
- * reported so we can work out something appropriate for your use-case.
644
839
  */
645
840
  GumboVector /* GumboError */ errors;
841
+
842
+ /**
843
+ * True if the parser encountered an error.
844
+ *
845
+ * This can be true and `errors` an empty `GumboVector` if the `max_errors`
846
+ * option was set to 0.
847
+ */
848
+ bool document_error;
849
+
850
+ /**
851
+ * A status code indicating whether parsing finished successfully or was
852
+ * stopped mid-document due to exceptional circumstances.
853
+ */
854
+ GumboOutputStatus status;
646
855
  } GumboOutput;
647
856
 
648
857
  /**
649
- * Parses a buffer of UTF8 text into an GumboNode parse tree. The buffer must
650
- * live at least as long as the parse tree, as some fields (eg. original_text)
651
- * point directly into the original buffer.
858
+ * Parses a buffer of UTF-8 text into a `GumboNode` parse tree. The
859
+ * buffer must live at least as long as the parse tree, as some fields
860
+ * (eg. `original_text`) point directly into the original buffer.
652
861
  *
653
862
  * This doesn't support buffers longer than 4 gigabytes.
654
863
  */
655
864
  GumboOutput* gumbo_parse(const char* buffer);
656
865
 
657
866
  /**
658
- * Extended version of gumbo_parse that takes an explicit options structure,
659
- * buffer, and length.
867
+ * Extended version of `gumbo_parse` that takes an explicit options
868
+ * structure, buffer, and length.
869
+ */
870
+ GumboOutput* gumbo_parse_with_options (
871
+ const GumboOptions* options,
872
+ const char* buffer,
873
+ size_t buffer_length
874
+ );
875
+
876
+ /**
877
+ * Compute the quirks mode based on the name, public identifier, and system
878
+ * identifier. Any of these may be `NULL` to indicate a missing value.
879
+ */
880
+ GumboQuirksModeEnum gumbo_compute_quirks_mode (
881
+ const char *name,
882
+ const char *pubid,
883
+ const char *sysid
884
+ );
885
+
886
+ /** Convert a `GumboOutputStatus` code into a readable description. */
887
+ const char* gumbo_status_to_string(GumboOutputStatus status);
888
+
889
+ /** Release the memory used for the parse tree and parse errors. */
890
+ void gumbo_destroy_output(GumboOutput* output);
891
+
892
+ /** Opaque GumboError type */
893
+ typedef struct GumboInternalError GumboError;
894
+
895
+ /**
896
+ * Returns the position of the error.
660
897
  */
661
- GumboOutput* gumbo_parse_with_options(
662
- const GumboOptions* options, const char* buffer, size_t buffer_length);
898
+ GumboSourcePosition gumbo_error_position(const GumboError* error);
663
899
 
664
- /** Release the memory used for the parse tree & parse errors. */
665
- void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output);
900
+ /**
901
+ * Returns a constant string representation of the error's code. This is owned
902
+ * by the library and should not be freed by the caller.
903
+ */
904
+ const char* gumbo_error_code(const GumboError* error);
905
+
906
+ /**
907
+ * Prints an error to a string. This stores a freshly-allocated buffer
908
+ * containing the error message text in output. The caller is responsible for
909
+ * freeing the buffer. The size of the error message is returned. The error
910
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
911
+ * returned size must be used.
912
+ */
913
+ size_t gumbo_error_to_string(const GumboError* error, char **output);
914
+
915
+ /**
916
+ * Prints a caret diagnostic to a string. This stores a freshly-allocated
917
+ * buffer containing the error message text in output. The caller is responsible for
918
+ * freeing the buffer. The size of the error message is returned. The error
919
+ * message itself may not be NUL-terminated and may contain NUL bytes so the
920
+ * returned size must be used.
921
+ */
922
+ size_t gumbo_caret_diagnostic_to_string (
923
+ const GumboError* error,
924
+ const char* source_text,
925
+ size_t source_length,
926
+ char** output
927
+ );
928
+
929
+ /**
930
+ * Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
931
+ * instead of writing to a string.
932
+ */
933
+ void gumbo_print_caret_diagnostic (
934
+ const GumboError* error,
935
+ const char* source_text,
936
+ size_t source_length
937
+ );
666
938
 
667
939
  #ifdef __cplusplus
668
940
  }
669
941
  #endif
670
942
 
671
- #endif // GUMBO_GUMBO_H_
943
+ #endif // GUMBO_H