nokogumbo 1.5.0 → 2.0.0.pre.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +56 -0
  3. data/README.md +146 -22
  4. data/ext/nokogumbo/extconf.rb +116 -0
  5. data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
  6. data/gumbo-parser/src/ascii.c +33 -0
  7. data/gumbo-parser/src/ascii.h +31 -0
  8. data/gumbo-parser/src/attribute.c +26 -28
  9. data/gumbo-parser/src/attribute.h +3 -23
  10. data/gumbo-parser/src/char_ref.c +135 -2351
  11. data/gumbo-parser/src/char_ref.h +13 -29
  12. data/gumbo-parser/src/error.c +215 -133
  13. data/gumbo-parser/src/error.h +34 -49
  14. data/gumbo-parser/src/foreign_attrs.c +104 -0
  15. data/gumbo-parser/src/gumbo.h +506 -304
  16. data/gumbo-parser/src/insertion_mode.h +4 -28
  17. data/gumbo-parser/src/macros.h +91 -0
  18. data/gumbo-parser/src/parser.c +1989 -1431
  19. data/gumbo-parser/src/parser.h +6 -22
  20. data/gumbo-parser/src/replacement.h +33 -0
  21. data/gumbo-parser/src/string_buffer.c +43 -50
  22. data/gumbo-parser/src/string_buffer.h +24 -40
  23. data/gumbo-parser/src/string_piece.c +39 -39
  24. data/gumbo-parser/src/svg_attrs.c +174 -0
  25. data/gumbo-parser/src/svg_tags.c +137 -0
  26. data/gumbo-parser/src/tag.c +186 -59
  27. data/gumbo-parser/src/tag_lookup.c +382 -0
  28. data/gumbo-parser/src/tag_lookup.h +13 -0
  29. data/gumbo-parser/src/token_type.h +1 -25
  30. data/gumbo-parser/src/tokenizer.c +899 -495
  31. data/gumbo-parser/src/tokenizer.h +37 -37
  32. data/gumbo-parser/src/tokenizer_states.h +6 -22
  33. data/gumbo-parser/src/utf8.c +103 -86
  34. data/gumbo-parser/src/utf8.h +37 -41
  35. data/gumbo-parser/src/util.c +48 -38
  36. data/gumbo-parser/src/util.h +10 -40
  37. data/gumbo-parser/src/vector.c +45 -57
  38. data/gumbo-parser/src/vector.h +17 -39
  39. data/lib/nokogumbo.rb +10 -174
  40. data/lib/nokogumbo/html5.rb +250 -0
  41. data/lib/nokogumbo/html5/document.rb +37 -0
  42. data/lib/nokogumbo/html5/document_fragment.rb +46 -0
  43. data/lib/nokogumbo/version.rb +3 -0
  44. data/lib/nokogumbo/xml/node.rb +57 -0
  45. metadata +32 -19
  46. data/ext/nokogumboc/extconf.rb +0 -60
  47. data/gumbo-parser/src/char_ref.rl +0 -2554
  48. data/gumbo-parser/src/string_piece.h +0 -38
  49. data/gumbo-parser/src/tag.in +0 -150
  50. data/gumbo-parser/src/tag_enum.h +0 -153
  51. data/gumbo-parser/src/tag_gperf.h +0 -105
  52. data/gumbo-parser/src/tag_sizes.h +0 -4
  53. data/gumbo-parser/src/tag_strings.h +0 -153
  54. data/gumbo-parser/visualc/include/strings.h +0 -4
  55. data/test-nokogumbo.rb +0 -190
@@ -1,26 +1,6 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // Error types, enums, and handling functions.
18
-
19
1
  #ifndef GUMBO_ERROR_H_
20
2
  #define GUMBO_ERROR_H_
21
- #ifdef _MSC_VER
22
- #define _CRT_SECURE_NO_WARNINGS
23
- #endif
3
+
24
4
  #include <stdint.h>
25
5
 
26
6
  #include "gumbo.h"
@@ -77,11 +57,12 @@ typedef enum {
77
57
  GUMBO_ERR_DOCTYPE_END,
78
58
  GUMBO_ERR_PARSER,
79
59
  GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
60
+ GUMBO_ERR_SELF_CLOSING_END_TAG,
80
61
  } GumboErrorType;
81
62
 
82
63
  // Additional data for duplicated attributes.
83
64
  typedef struct GumboInternalDuplicateAttrError {
84
- // The name of the attribute. Owned by this struct.
65
+ // The name of the attribute. Owned by this struct.
85
66
  const char* name;
86
67
 
87
68
  // The (0-based) index within the attributes vector of the original
@@ -93,7 +74,7 @@ typedef struct GumboInternalDuplicateAttrError {
93
74
  } GumboDuplicateAttrError;
94
75
 
95
76
  // A simplified representation of the tokenizer state, designed to be more
96
- // useful to clients of this library than the internal representation. This
77
+ // useful to clients of this library than the internal representation. This
97
78
  // condenses the actual states used in the tokenizer state machine into a few
98
79
  // values that will be familiar to users of HTML.
99
80
  typedef enum {
@@ -129,20 +110,20 @@ typedef struct GumboInternalParserError {
129
110
  // The type of input token that resulted in this error.
130
111
  GumboTokenType input_type;
131
112
 
132
- // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
113
+ // The HTML tag of the input token. GUMBO_TAG_UNKNOWN if this was not a tag token.
133
114
  GumboTag input_tag;
134
115
 
135
116
  // The insertion mode that the parser was in at the time.
136
117
  GumboInsertionMode parser_state;
137
118
 
138
- // The tag stack at the point of the error. Note that this is an GumboVector
119
+ // The tag stack at the point of the error. Note that this is a GumboVector
139
120
  // of GumboTags *stored by value* - cast the void* to a GumboTag directly to
140
121
  // get at the tag.
141
122
  GumboVector /* GumboTag */ tag_stack;
142
123
  } GumboParserError;
143
124
 
144
125
  // The overall error struct representing an error in decoding/tokenizing/parsing
145
- // the HTML. This contains an enumerated type flag, a source position, and then
126
+ // the HTML. This contains an enumerated type flag, a source position, and then
146
127
  // a union of fields containing data specific to the error.
147
128
  typedef struct GumboInternalError {
148
129
  // The type of error.
@@ -163,7 +144,7 @@ typedef struct GumboInternalError {
163
144
  // * GUMBO_ERR_UTF8_TRUNCATED
164
145
  // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
165
146
  // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
166
- uint64_t codepoint;
147
+ uint32_t codepoint;
167
148
 
168
149
  // Tokenizer errors.
169
150
  GumboTokenizerError tokenizer;
@@ -183,7 +164,7 @@ typedef struct GumboInternalError {
183
164
  } GumboError;
184
165
 
185
166
  // Adds a new error to the parser's error list, and returns a pointer to it so
186
- // that clients can fill out the rest of its fields. May return NULL if we're
167
+ // that clients can fill out the rest of its fields. May return NULL if we're
187
168
  // already over the max_errors field specified in GumboOptions.
188
169
  GumboError* gumbo_add_error(struct GumboInternalParser* parser);
189
170
 
@@ -194,32 +175,36 @@ void gumbo_init_errors(struct GumboInternalParser* errors);
194
175
  void gumbo_destroy_errors(struct GumboInternalParser* errors);
195
176
 
196
177
  // Frees the memory used for a single GumboError.
197
- void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
198
-
199
- // Prints an error to a string. This fills an empty GumboStringBuffer with a
200
- // freshly-allocated buffer containing the error message text. The caller is
201
- // responsible for deleting the buffer. (Note that the buffer is allocated with
202
- // the allocator specified in the GumboParser config and hence should be freed
203
- // by gumbo_parser_deallocate().)
204
- void gumbo_error_to_string(struct GumboInternalParser* parser,
205
- const GumboError* error, GumboStringBuffer* output);
206
-
207
- // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
208
- // with a freshly-allocated buffer containing the error message text. The
209
- // caller is responsible for deleting the buffer. (Note that the buffer is
210
- // allocated with the allocator specified in the GumboParser config and hence
211
- // should be freed by gumbo_parser_deallocate().)
212
- void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser,
213
- const GumboError* error, const char* source_text,
214
- GumboStringBuffer* output);
178
+ void gumbo_error_destroy(GumboError* error);
179
+
180
+ // Prints an error to a string. This fills an empty GumboStringBuffer with a
181
+ // freshly-allocated buffer containing the error message text. The caller is
182
+ // responsible for freeing the buffer.
183
+ void gumbo_error_to_string (
184
+ const GumboError* error,
185
+ GumboStringBuffer* output
186
+ );
187
+
188
+ // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
189
+ // with a freshly-allocated buffer containing the error message text. The
190
+ // caller is responsible for freeing the buffer.
191
+ void gumbo_caret_diagnostic_to_string (
192
+ const GumboError* error,
193
+ const char* source_text,
194
+ size_t source_length,
195
+ GumboStringBuffer* output
196
+ );
215
197
 
216
198
  // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
217
199
  // of writing to a string.
218
- void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser,
219
- const GumboError* error, const char* source_text);
200
+ void gumbo_print_caret_diagnostic (
201
+ const GumboError* error,
202
+ const char* source_text,
203
+ size_t source_length
204
+ );
220
205
 
221
206
  #ifdef __cplusplus
222
207
  }
223
208
  #endif
224
209
 
225
- #endif // GUMBO_ERROR_H_
210
+ #endif // GUMBO_ERROR_H_
@@ -0,0 +1,104 @@
1
+ /* ANSI-C code produced by gperf version 3.1 */
2
+ /* Command-line: gperf -m100 -n lib/foreign_attrs.gperf */
3
+ /* Computed positions: -k'2,8' */
4
+ /* Filtered by: mk/gperf-filter.sed */
5
+
6
+ #include "replacement.h"
7
+ #include "macros.h"
8
+ #include <string.h>
9
+
10
+ #define TOTAL_KEYWORDS 11
11
+ #define MIN_WORD_LENGTH 5
12
+ #define MAX_WORD_LENGTH 13
13
+ #define MIN_HASH_VALUE 0
14
+ #define MAX_HASH_VALUE 10
15
+ /* maximum key range = 11, duplicates = 0 */
16
+
17
+ static inline unsigned int
18
+ hash (register const char *str, register size_t len)
19
+ {
20
+ static const unsigned char asso_values[] =
21
+ {
22
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
23
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
24
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
25
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
26
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
27
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
28
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
29
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
30
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
31
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 2,
32
+ 11, 10, 11, 9, 7, 6, 11, 11, 1, 0,
33
+ 11, 5, 11, 11, 4, 11, 11, 11, 11, 11,
34
+ 11, 3, 11, 11, 11, 11, 11, 11, 11, 11,
35
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
36
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
37
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
38
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
39
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
40
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
41
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
42
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
43
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
44
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
45
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
46
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
47
+ 11, 11, 11, 11, 11, 11
48
+ };
49
+ register unsigned int hval = 0;
50
+
51
+ switch (len)
52
+ {
53
+ default:
54
+ hval += asso_values[(unsigned char)str[7]];
55
+ /*FALLTHROUGH*/
56
+ case 7:
57
+ case 6:
58
+ case 5:
59
+ case 4:
60
+ case 3:
61
+ case 2:
62
+ hval += asso_values[(unsigned char)str[1]];
63
+ break;
64
+ }
65
+ return hval;
66
+ }
67
+
68
+ const ForeignAttrReplacement *
69
+ gumbo_get_foreign_attr_replacement (register const char *str, register size_t len)
70
+ {
71
+ static const unsigned char lengthtable[] =
72
+ {
73
+ 5, 11, 9, 13, 10, 10, 10, 11, 10, 8, 8
74
+ };
75
+ static const ForeignAttrReplacement wordlist[] =
76
+ {
77
+ {"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
78
+ {"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
79
+ {"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
80
+ {"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
81
+ {"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
82
+ {"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
83
+ {"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
84
+ {"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
85
+ {"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
86
+ {"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
87
+ {"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML}
88
+ };
89
+
90
+ if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
91
+ {
92
+ register unsigned int key = hash (str, len);
93
+
94
+ if (key <= MAX_HASH_VALUE)
95
+ if (len == lengthtable[key])
96
+ {
97
+ register const char *s = wordlist[key].from;
98
+
99
+ if (s && *str == *s && !memcmp (str + 1, s + 1, len - 1))
100
+ return &wordlist[key];
101
+ }
102
+ }
103
+ return 0;
104
+ }
@@ -1,51 +1,33 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
18
- // GUMBO_ as a prefix for enum constants (static constants get the Google-style
19
- // kGumbo prefix).
1
+ // Copyright 2010 Google Inc.
2
+ // Copyright 2018 Craig Barnes.
3
+ // Licensed under the Apache License, version 2.0.
4
+
5
+ // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions,
6
+ // GUMBO_ as a prefix for enum constants and kGumbo as a prefix for
7
+ // static constants.
20
8
 
21
9
  /**
22
10
  * @file
23
11
  * @mainpage Gumbo HTML Parser
24
12
  *
25
- * This provides a conformant, no-dependencies implementation of the HTML5
26
- * parsing algorithm. It supports only UTF8; if you need to parse a different
27
- * encoding, run a preprocessing step to convert to UTF8. It returns a parse
28
- * tree made of the structs in this file.
13
+ * This provides a conformant, no-dependencies implementation of the
14
+ * [HTML5] parsing algorithm. It supports only UTF-8 -- if you need
15
+ * to parse a different encoding, run a preprocessing step to convert
16
+ * to UTF-8. It returns a parse tree made of the structs in this file.
29
17
  *
30
18
  * Example:
31
19
  * @code
32
20
  * GumboOutput* output = gumbo_parse(input);
33
21
  * do_something_with_doctype(output->document);
34
22
  * do_something_with_html_tree(output->root);
35
- * gumbo_destroy_output(&options, output);
23
+ * gumbo_destroy_output(output);
36
24
  * @endcode
37
- * HTML5 Spec:
38
25
  *
39
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
26
+ * [HTML5]: https://html.spec.whatwg.org/multipage/
40
27
  */
41
28
 
42
- #ifndef GUMBO_GUMBO_H_
43
- #define GUMBO_GUMBO_H_
44
-
45
- #ifdef _MSC_VER
46
- #define _CRT_SECURE_NO_WARNINGS
47
- #define fileno _fileno
48
- #endif
29
+ #ifndef GUMBO_H
30
+ #define GUMBO_H
49
31
 
50
32
  #include <stdbool.h>
51
33
  #include <stddef.h>
@@ -55,73 +37,77 @@ extern "C" {
55
37
  #endif
56
38
 
57
39
  /**
58
- * A struct representing a character position within the original text buffer.
59
- * Line and column numbers are 1-based and offsets are 0-based, which matches
60
- * how most editors and command-line tools work. Also, columns measure
61
- * positions in terms of characters while offsets measure by bytes; this is
62
- * because the offset field is often used to pull out a particular region of
63
- * text (which in most languages that bind to C implies pointer arithmetic on a
64
- * buffer of bytes), while the column field is often used to reference a
65
- * particular column on a printable display, which nowadays is usually UTF-8.
40
+ * A struct representing a character position within the original text
41
+ * buffer. Line and column numbers are 1-based and offsets are 0-based,
42
+ * which matches how most editors and command-line tools work.
66
43
  */
67
44
  typedef struct {
68
- unsigned int line;
69
- unsigned int column;
70
- unsigned int offset;
45
+ size_t line;
46
+ size_t column;
47
+ size_t offset;
71
48
  } GumboSourcePosition;
72
49
 
73
50
  /**
74
- * A SourcePosition used for elements that have no source position, i.e.
75
- * parser-inserted elements.
76
- */
77
- extern const GumboSourcePosition kGumboEmptySourcePosition;
78
-
79
- /**
80
- * A struct representing a string or part of a string. Strings within the
81
- * parser are represented by a char* and a length; the char* points into
82
- * an existing data buffer owned by some other code (often the original input).
83
- * GumboStringPieces are assumed (by convention) to be immutable, because they
84
- * may share data. Use GumboStringBuffer if you need to construct a string.
85
- * Clients should assume that it is not NUL-terminated, and should always use
86
- * explicit lengths when manipulating them.
51
+ * A struct representing a string or part of a string. Strings within
52
+ * the parser are represented by a `char*` and a length; the `char*`
53
+ * points into an existing data buffer owned by some other code (often
54
+ * the original input). `GumboStringPiece`s are assumed (by convention)
55
+ * to be immutable, because they may share data. Clients should assume
56
+ * that it is not NUL-terminated and should always use explicit lengths
57
+ * when manipulating them.
87
58
  */
88
59
  typedef struct {
89
- /** A pointer to the beginning of the string. NULL iff length == 0. */
60
+ /** A pointer to the beginning of the string. `NULL` if `length == 0`. */
90
61
  const char* data;
91
62
 
92
- /** The length of the string fragment, in bytes. May be zero. */
63
+ /** The length of the string fragment, in bytes (may be zero). */
93
64
  size_t length;
94
65
  } GumboStringPiece;
95
66
 
67
+ #define GUMBO_EMPTY_STRING_INIT { .data = NULL, .length = 0 }
96
68
  /** A constant to represent a 0-length null string. */
97
- extern const GumboStringPiece kGumboEmptyString;
69
+ #define kGumboEmptyString (const GumboStringPiece)GUMBO_EMPTY_STRING_INIT
70
+
71
+ /**
72
+ * Compares two `GumboStringPiece`s, and returns `true` if they're
73
+ * equal or `false` otherwise.
74
+ */
75
+ bool gumbo_string_equals (
76
+ const GumboStringPiece* str1,
77
+ const GumboStringPiece* str2
78
+ );
98
79
 
99
80
  /**
100
- * Compares two GumboStringPieces, and returns true if they're equal or false
101
- * otherwise.
81
+ * Compares two `GumboStringPiece`s, ignoring case, and returns `true`
82
+ * if they're equal or `false` otherwise.
102
83
  */
103
- bool gumbo_string_equals(
104
- const GumboStringPiece* str1, const GumboStringPiece* str2);
84
+ bool gumbo_string_equals_ignore_case (
85
+ const GumboStringPiece* str1,
86
+ const GumboStringPiece* str2
87
+ );
105
88
 
106
89
  /**
107
- * Compares two GumboStringPieces ignoring case, and returns true if they're
108
- * equal or false otherwise.
90
+ * Check if the first `GumboStringPiece` is a prefix of the second, ignoring
91
+ * case.
109
92
  */
110
- bool gumbo_string_equals_ignore_case(
111
- const GumboStringPiece* str1, const GumboStringPiece* str2);
93
+ bool gumbo_string_prefix_ignore_case (
94
+ const GumboStringPiece* prefix,
95
+ const GumboStringPiece* str
96
+ );
112
97
 
113
98
  /**
114
- * A simple vector implementation. This stores a pointer to a data array and a
115
- * length. All elements are stored as void*; client code must cast to the
116
- * appropriate type. Overflows upon addition result in reallocation of the data
117
- * array, with the size doubling to maintain O(1) amortized cost. There is no
118
- * removal function, as this isn't needed for any of the operations within this
119
- * library. Iteration can be done through inspecting the structure directly in
120
- * a for-loop.
99
+ * A simple vector implementation. This stores a pointer to a data array
100
+ * and a length. All elements are stored as `void*`; client code must
101
+ * cast to the appropriate type. Overflows upon addition result in
102
+ * reallocation of the data array, with the size doubling to maintain
103
+ * `O(1)` amortized cost. There is no removal function, as this isn't
104
+ * needed for any of the operations within this library. Iteration can
105
+ * be done through inspecting the structure directly in a `for` loop.
121
106
  */
122
107
  typedef struct {
123
- /** Data elements. This points to a dynamically-allocated array of capacity
124
- * elements, each a void* to the element itself.
108
+ /**
109
+ * Data elements. This points to a dynamically-allocated array of
110
+ * `capacity` elements, each a `void*` to the element itself.
125
111
  */
126
112
  void** data;
127
113
 
@@ -132,82 +118,230 @@ typedef struct {
132
118
  unsigned int capacity;
133
119
  } GumboVector;
134
120
 
135
- /** An empty (0-length, 0-capacity) GumboVector. */
136
- extern const GumboVector kGumboEmptyVector;
121
+ # define GUMBO_EMPTY_VECTOR_INIT { .data = NULL, .length = 0, .capacity = 0 }
122
+ /** An empty (0-length, 0-capacity) `GumboVector`. */
123
+ #define kGumboEmptyVector (const GumboVector)GUMBO_EMPTY_VECTOR_INIT
137
124
 
138
125
  /**
139
- * Returns the first index at which an element appears in this vector (testing
140
- * by pointer equality), or -1 if it never does.
126
+ * Returns the first index at which an element appears in this vector
127
+ * (testing by pointer equality), or `-1` if it never does.
141
128
  */
142
129
  int gumbo_vector_index_of(GumboVector* vector, const void* element);
143
130
 
144
131
  /**
145
- * An enum for all the tags defined in the HTML5 standard. These correspond to
146
- * the tag names themselves. Enum constants exist only for tags which appear in
147
- * the spec itself (or for tags with special handling in the SVG and MathML
148
- * namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
149
- * name can be obtained through original_tag.
132
+ * An `enum` for all the tags defined in the HTML5 standard. These
133
+ * correspond to the tag names themselves. Enum constants exist only
134
+ * for tags that appear in the spec itself (or for tags with special
135
+ * handling in the SVG and MathML namespaces). Any other tags appear
136
+ * as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained
137
+ * through `original_tag`.
150
138
  *
151
- * This is mostly for API convenience, so that clients of this library don't
152
- * need to perform a strcasecmp to find the normalized tag name. It also has
153
- * efficiency benefits, by letting the parser work with enums instead of
154
- * strings.
139
+ * This is mostly for API convenience, so that clients of this library
140
+ * don't need to perform a `strcasecmp` to find the normalized tag
141
+ * name. It also has efficiency benefits, by letting the parser work
142
+ * with enums instead of strings.
155
143
  */
156
144
  typedef enum {
157
- // Load all the tags from an external source, generated from tag.in.
158
- #include "tag_enum.h"
159
- // Used for all tags that don't have special handling in HTML. Add new tags
160
- // to the end of tag.in so as to preserve backwards-compatibility.
145
+ GUMBO_TAG_HTML,
146
+ GUMBO_TAG_HEAD,
147
+ GUMBO_TAG_TITLE,
148
+ GUMBO_TAG_BASE,
149
+ GUMBO_TAG_LINK,
150
+ GUMBO_TAG_META,
151
+ GUMBO_TAG_STYLE,
152
+ GUMBO_TAG_SCRIPT,
153
+ GUMBO_TAG_NOSCRIPT,
154
+ GUMBO_TAG_TEMPLATE,
155
+ GUMBO_TAG_BODY,
156
+ GUMBO_TAG_ARTICLE,
157
+ GUMBO_TAG_SECTION,
158
+ GUMBO_TAG_NAV,
159
+ GUMBO_TAG_ASIDE,
160
+ GUMBO_TAG_H1,
161
+ GUMBO_TAG_H2,
162
+ GUMBO_TAG_H3,
163
+ GUMBO_TAG_H4,
164
+ GUMBO_TAG_H5,
165
+ GUMBO_TAG_H6,
166
+ GUMBO_TAG_HGROUP,
167
+ GUMBO_TAG_HEADER,
168
+ GUMBO_TAG_FOOTER,
169
+ GUMBO_TAG_ADDRESS,
170
+ GUMBO_TAG_P,
171
+ GUMBO_TAG_HR,
172
+ GUMBO_TAG_PRE,
173
+ GUMBO_TAG_BLOCKQUOTE,
174
+ GUMBO_TAG_OL,
175
+ GUMBO_TAG_UL,
176
+ GUMBO_TAG_LI,
177
+ GUMBO_TAG_DL,
178
+ GUMBO_TAG_DT,
179
+ GUMBO_TAG_DD,
180
+ GUMBO_TAG_FIGURE,
181
+ GUMBO_TAG_FIGCAPTION,
182
+ GUMBO_TAG_MAIN,
183
+ GUMBO_TAG_DIV,
184
+ GUMBO_TAG_A,
185
+ GUMBO_TAG_EM,
186
+ GUMBO_TAG_STRONG,
187
+ GUMBO_TAG_SMALL,
188
+ GUMBO_TAG_S,
189
+ GUMBO_TAG_CITE,
190
+ GUMBO_TAG_Q,
191
+ GUMBO_TAG_DFN,
192
+ GUMBO_TAG_ABBR,
193
+ GUMBO_TAG_DATA,
194
+ GUMBO_TAG_TIME,
195
+ GUMBO_TAG_CODE,
196
+ GUMBO_TAG_VAR,
197
+ GUMBO_TAG_SAMP,
198
+ GUMBO_TAG_KBD,
199
+ GUMBO_TAG_SUB,
200
+ GUMBO_TAG_SUP,
201
+ GUMBO_TAG_I,
202
+ GUMBO_TAG_B,
203
+ GUMBO_TAG_U,
204
+ GUMBO_TAG_MARK,
205
+ GUMBO_TAG_RUBY,
206
+ GUMBO_TAG_RT,
207
+ GUMBO_TAG_RP,
208
+ GUMBO_TAG_BDI,
209
+ GUMBO_TAG_BDO,
210
+ GUMBO_TAG_SPAN,
211
+ GUMBO_TAG_BR,
212
+ GUMBO_TAG_WBR,
213
+ GUMBO_TAG_INS,
214
+ GUMBO_TAG_DEL,
215
+ GUMBO_TAG_IMAGE,
216
+ GUMBO_TAG_IMG,
217
+ GUMBO_TAG_IFRAME,
218
+ GUMBO_TAG_EMBED,
219
+ GUMBO_TAG_OBJECT,
220
+ GUMBO_TAG_PARAM,
221
+ GUMBO_TAG_VIDEO,
222
+ GUMBO_TAG_AUDIO,
223
+ GUMBO_TAG_SOURCE,
224
+ GUMBO_TAG_TRACK,
225
+ GUMBO_TAG_CANVAS,
226
+ GUMBO_TAG_MAP,
227
+ GUMBO_TAG_AREA,
228
+ GUMBO_TAG_MATH,
229
+ GUMBO_TAG_MI,
230
+ GUMBO_TAG_MO,
231
+ GUMBO_TAG_MN,
232
+ GUMBO_TAG_MS,
233
+ GUMBO_TAG_MTEXT,
234
+ GUMBO_TAG_MGLYPH,
235
+ GUMBO_TAG_MALIGNMARK,
236
+ GUMBO_TAG_ANNOTATION_XML,
237
+ GUMBO_TAG_SVG,
238
+ GUMBO_TAG_FOREIGNOBJECT,
239
+ GUMBO_TAG_DESC,
240
+ GUMBO_TAG_TABLE,
241
+ GUMBO_TAG_CAPTION,
242
+ GUMBO_TAG_COLGROUP,
243
+ GUMBO_TAG_COL,
244
+ GUMBO_TAG_TBODY,
245
+ GUMBO_TAG_THEAD,
246
+ GUMBO_TAG_TFOOT,
247
+ GUMBO_TAG_TR,
248
+ GUMBO_TAG_TD,
249
+ GUMBO_TAG_TH,
250
+ GUMBO_TAG_FORM,
251
+ GUMBO_TAG_FIELDSET,
252
+ GUMBO_TAG_LEGEND,
253
+ GUMBO_TAG_LABEL,
254
+ GUMBO_TAG_INPUT,
255
+ GUMBO_TAG_BUTTON,
256
+ GUMBO_TAG_SELECT,
257
+ GUMBO_TAG_DATALIST,
258
+ GUMBO_TAG_OPTGROUP,
259
+ GUMBO_TAG_OPTION,
260
+ GUMBO_TAG_TEXTAREA,
261
+ GUMBO_TAG_KEYGEN,
262
+ GUMBO_TAG_OUTPUT,
263
+ GUMBO_TAG_PROGRESS,
264
+ GUMBO_TAG_METER,
265
+ GUMBO_TAG_DETAILS,
266
+ GUMBO_TAG_SUMMARY,
267
+ GUMBO_TAG_MENU,
268
+ GUMBO_TAG_MENUITEM,
269
+ GUMBO_TAG_APPLET,
270
+ GUMBO_TAG_ACRONYM,
271
+ GUMBO_TAG_BGSOUND,
272
+ GUMBO_TAG_DIR,
273
+ GUMBO_TAG_FRAME,
274
+ GUMBO_TAG_FRAMESET,
275
+ GUMBO_TAG_NOFRAMES,
276
+ GUMBO_TAG_LISTING,
277
+ GUMBO_TAG_XMP,
278
+ GUMBO_TAG_NEXTID,
279
+ GUMBO_TAG_NOEMBED,
280
+ GUMBO_TAG_PLAINTEXT,
281
+ GUMBO_TAG_RB,
282
+ GUMBO_TAG_STRIKE,
283
+ GUMBO_TAG_BASEFONT,
284
+ GUMBO_TAG_BIG,
285
+ GUMBO_TAG_BLINK,
286
+ GUMBO_TAG_CENTER,
287
+ GUMBO_TAG_FONT,
288
+ GUMBO_TAG_MARQUEE,
289
+ GUMBO_TAG_MULTICOL,
290
+ GUMBO_TAG_NOBR,
291
+ GUMBO_TAG_SPACER,
292
+ GUMBO_TAG_TT,
293
+ GUMBO_TAG_RTC,
294
+ GUMBO_TAG_DIALOG,
295
+ // Used for all tags that don't have special handling in HTML.
161
296
  GUMBO_TAG_UNKNOWN,
162
297
  // A marker value to indicate the end of the enum, for iterating over it.
163
- // Also used as the terminator for varargs functions that take tags.
164
298
  GUMBO_TAG_LAST,
165
299
  } GumboTag;
166
300
 
167
301
  /**
168
- * Returns the normalized (usually all-lowercased, except for foreign content)
169
- * tag name for an GumboTag enum. Return value is static data owned by the
170
- * library.
302
+ * Returns the normalized (all lower case) tag name for a `GumboTag` enum. The
303
+ * return value is static data owned by the library.
171
304
  */
172
305
  const char* gumbo_normalized_tagname(GumboTag tag);
173
306
 
174
307
  /**
175
- * Extracts the tag name from the original_text field of an element or token by
176
- * stripping off </> characters and attributes and adjusting the passed-in
177
- * GumboStringPiece appropriately. The tag name is in the original case and
178
- * shares a buffer with the original text, to simplify memory management.
179
- * Behavior is undefined if a string-piece that doesn't represent an HTML tag
180
- * (<tagname> or </tagname>) is passed in. If the string piece is completely
181
- * empty (NULL data pointer), then this function will exit successfully as a
182
- * no-op.
308
+ * Extracts the tag name from the `original_text` field of an element
309
+ * or token by stripping off `</>` characters and attributes and
310
+ * adjusting the passed-in `GumboStringPiece` appropriately. The tag
311
+ * name is in the original case and shares a buffer with the original
312
+ * text, to simplify memory management. Behavior is undefined if a
313
+ * string piece that doesn't represent an HTML tag (`<tagname>` or
314
+ * `</tagname>`) is passed in. If the string piece is completely
315
+ * empty (`NULL` data pointer), then this function will exit
316
+ * successfully as a no-op.
183
317
  */
184
318
  void gumbo_tag_from_original_text(GumboStringPiece* text);
185
319
 
186
320
  /**
187
- * Fixes the case of SVG elements that are not all lowercase.
188
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
189
- * This is not done at parse time because there's no place to store a mutated
190
- * tag name. tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags
191
- * without special handling), while original_tag_name is a pointer into the
192
- * original buffer. Instead, we provide this helper function that clients can
193
- * use to rename SVG tags as appropriate.
194
- * Returns the case-normalized SVG tagname if a replacement is found, or NULL if
195
- * no normalization is called for. The return value is static data and owned by
196
- * the library.
321
+ * Fixes the case of SVG elements that are not all lowercase. This is
322
+ * not done at parse time because there's no place to store a mutated
323
+ * tag name. `tag_name` is an enum (which will be `GUMBO_TAG_UNKNOWN` for most
324
+ * SVG tags without special handling), while `original_tag_name` is a
325
+ * pointer into the original buffer. Instead, we provide this helper
326
+ * function that clients can use to rename SVG tags as appropriate.
327
+ * Returns the case-normalized SVG tagname if a replacement is found, or
328
+ * `NULL` if no normalization is called for. The return value is static
329
+ * data and owned by the library.
330
+ *
331
+ * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
197
332
  */
198
333
  const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
199
334
 
200
335
  /**
201
- * Converts a tag name string (which may be in upper or mixed case) to a tag
202
- * enum. The `tag` version expects `tagname` to be NULL-terminated
336
+ * Converts a tag name string (which may be in upper or mixed case) to a
337
+ * tag enum.
203
338
  */
204
- GumboTag gumbo_tag_enum(const char* tagname);
205
- GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
339
+ GumboTag gumbo_tagn_enum(const char* tagname, size_t length);
206
340
 
207
341
  /**
208
342
  * Attribute namespaces.
209
- * HTML includes special handling for XLink, XML, and XMLNS namespaces on
210
- * attributes. Everything else goes in the generic "NONE" namespace.
343
+ * HTML includes special handling for XLink, XML, and XMLNS namespaces
344
+ * on attributes. Everything else goes in the generic "NONE" namespace.
211
345
  */
212
346
  typedef enum {
213
347
  GUMBO_ATTR_NAMESPACE_NONE,
@@ -217,46 +351,47 @@ typedef enum {
217
351
  } GumboAttributeNamespaceEnum;
218
352
 
219
353
  /**
220
- * A struct representing a single attribute on an HTML tag. This is a
221
- * name-value pair, but also includes information about source locations and
222
- * original source text.
354
+ * A struct representing a single attribute on an HTML tag. This is a
355
+ * name-value pair, but also includes information about source locations
356
+ * and original source text.
223
357
  */
224
358
  typedef struct {
225
359
  /**
226
- * The namespace for the attribute. This will usually be
227
- * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
228
- * values, per:
229
- * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
360
+ * The namespace for the attribute. This will usually be
361
+ * `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes
362
+ * take special values, per:
363
+ * https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
230
364
  */
231
365
  GumboAttributeNamespaceEnum attr_namespace;
232
366
 
233
367
  /**
234
- * The name of the attribute. This is in a freshly-allocated buffer to deal
235
- * with case-normalization, and is null-terminated.
368
+ * The name of the attribute. This is in a freshly-allocated buffer to
369
+ * deal with case-normalization and is null-terminated.
236
370
  */
237
371
  const char* name;
238
372
 
239
373
  /**
240
- * The original text of the attribute name, as a pointer into the original
241
- * source buffer.
374
+ * The original text of the attribute name, as a pointer into the
375
+ * original source buffer.
242
376
  */
243
377
  GumboStringPiece original_name;
244
378
 
245
379
  /**
246
- * The value of the attribute. This is in a freshly-allocated buffer to deal
247
- * with unescaping, and is null-terminated. It does not include any quotes
248
- * that surround the attribute. If the attribute has no value (for example,
249
- * 'selected' on a checkbox), this will be an empty string.
380
+ * The value of the attribute. This is in a freshly-allocated buffer
381
+ * to deal with unescaping and is null-terminated. It does not include
382
+ * any quotes that surround the attribute. If the attribute has no
383
+ * value (for example, `selected` on a checkbox) this will be an empty
384
+ * string.
250
385
  */
251
386
  const char* value;
252
387
 
253
388
  /**
254
- * The original text of the value of the attribute. This points into the
255
- * original source buffer. It includes any quotes that surround the
256
- * attribute, and you can look at original_value.data[0] and
257
- * original_value.data[original_value.length - 1] to determine what the quote
258
- * characters were. If the attribute has no value, this will be a 0-length
259
- * string.
389
+ * The original text of the value of the attribute. This points into
390
+ * the original source buffer. It includes any quotes that surround
391
+ * the attribute and you can look at `original_value.data[0]` and
392
+ * `original_value.data[original_value.length - 1]` to determine what
393
+ * the quote characters were. If the attribute has no value this will
394
+ * be a 0-length string.
260
395
  */
261
396
  GumboStringPiece original_value;
262
397
 
@@ -264,9 +399,9 @@ typedef struct {
264
399
  GumboSourcePosition name_start;
265
400
 
266
401
  /**
267
- * The ending position of the attribute name. This is not always derivable
402
+ * The ending position of the attribute name. This is not always derivable
268
403
  * from the starting position of the value because of the possibility of
269
- * whitespace around the = sign.
404
+ * whitespace around the `=` sign.
270
405
  */
271
406
  GumboSourcePosition name_end;
272
407
 
@@ -278,34 +413,37 @@ typedef struct {
278
413
  } GumboAttribute;
279
414
 
280
415
  /**
281
- * Given a vector of GumboAttributes, look up the one with the specified name
282
- * and return it, or NULL if no such attribute exists. This uses a
283
- * case-insensitive match, as HTML is case-insensitive.
416
+ * Given a vector of `GumboAttribute`s, look up the one with the
417
+ * specified name and return it, or `NULL` if no such attribute exists.
418
+ * This uses a case-insensitive match, as HTML is case-insensitive.
284
419
  */
285
420
  GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
286
421
 
287
422
  /**
288
- * Enum denoting the type of node. This determines the type of the node.v
289
- * union.
423
+ * Enum denoting the type of node. This determines the type of the
424
+ * `node.v` union.
290
425
  */
291
426
  typedef enum {
292
- /** Document node. v will be a GumboDocument. */
427
+ /** Document node. `v` will be a `GumboDocument`. */
293
428
  GUMBO_NODE_DOCUMENT,
294
- /** Element node. v will be a GumboElement. */
429
+ /** Element node. `v` will be a `GumboElement`. */
295
430
  GUMBO_NODE_ELEMENT,
296
- /** Text node. v will be a GumboText. */
431
+ /** Text node. `v` will be a `GumboText`. */
297
432
  GUMBO_NODE_TEXT,
298
- /** CDATA node. v will be a GumboText. */
433
+ /** CDATA node. `v` will be a `GumboText`. */
299
434
  GUMBO_NODE_CDATA,
300
- /** Comment node. v will be a GumboText, excluding comment delimiters. */
435
+ /** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */
301
436
  GUMBO_NODE_COMMENT,
302
- /** Text node, where all contents is whitespace. v will be a GumboText. */
437
+ /** Text node, where all contents is whitespace. `v` will be a `GumboText`. */
303
438
  GUMBO_NODE_WHITESPACE,
304
- /** Template node. This is separate from GUMBO_NODE_ELEMENT because many
305
- * client libraries will want to ignore the contents of template nodes, as
306
- * the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing
307
- * here, while clients that want to include template contents should also
308
- * check for GUMBO_NODE_TEMPLATE. v will be a GumboElement. */
439
+ /**
440
+ * Template node. This is separate from `GUMBO_NODE_ELEMENT` because
441
+ * many client libraries will want to ignore the contents of template
442
+ * nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will
443
+ * do the right thing here, while clients that want to include template
444
+ * contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a
445
+ * `GumboElement`.
446
+ */
309
447
  GUMBO_NODE_TEMPLATE
310
448
  } GumboNodeType;
311
449
 
@@ -315,9 +453,7 @@ typedef enum {
315
453
  */
316
454
  typedef struct GumboInternalNode GumboNode;
317
455
 
318
- /**
319
- * http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode
320
- */
456
+ /** https://dom.spec.whatwg.org/#concept-document-quirks */
321
457
  typedef enum {
322
458
  GUMBO_DOCTYPE_NO_QUIRKS,
323
459
  GUMBO_DOCTYPE_QUIRKS,
@@ -326,10 +462,11 @@ typedef enum {
326
462
 
327
463
  /**
328
464
  * Namespaces.
329
- * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather,
330
- * anything inside an <svg> tag is in the SVG namespace, anything inside the
331
- * <math> tag is in the MathML namespace, and anything else is inside the HTML
332
- * namespace. No other namespaces are supported, so this can be an enum only.
465
+ * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
466
+ * Rather, anything inside an `<svg>` tag is in the SVG namespace,
467
+ * anything inside the `<math>` tag is in the MathML namespace, and
468
+ * anything else is inside the HTML namespace. No other namespaces are
469
+ * supported, so this can be an `enum`.
333
470
  */
334
471
  typedef enum {
335
472
  GUMBO_NAMESPACE_HTML,
@@ -339,66 +476,70 @@ typedef enum {
339
476
 
340
477
  /**
341
478
  * Parse flags.
342
- * We track the reasons for parser insertion of nodes and store them in a
343
- * bitvector in the node itself. This lets client code optimize out nodes that
344
- * are implied by the HTML structure of the document, or flag constructs that
345
- * may not be allowed by a style guide, or track the prevalence of incorrect or
346
- * tricky HTML code.
479
+ * We track the reasons for parser insertion of nodes and store them in
480
+ * a bitvector in the node itself. This lets client code optimize out
481
+ * nodes that are implied by the HTML structure of the document, or flag
482
+ * constructs that may not be allowed by a style guide, or track the
483
+ * prevalence of incorrect or tricky HTML code.
347
484
  */
348
485
  typedef enum {
349
486
  /**
350
- * A normal node - both start and end tags appear in the source, nothing has
351
- * been reparented.
487
+ * A normal node -- both start and end tags appear in the source,
488
+ * nothing has been reparented.
352
489
  */
353
490
  GUMBO_INSERTION_NORMAL = 0,
354
491
 
355
492
  /**
356
- * A node inserted by the parser to fulfill some implicit insertion rule.
357
- * This is usually set in addition to some other flag giving a more specific
358
- * insertion reason; it's a generic catch-all term meaning "The start tag for
359
- * this node did not appear in the document source".
493
+ * A node inserted by the parser to fulfill some implicit insertion
494
+ * rule. This is usually set in addition to some other flag giving a
495
+ * more specific insertion reason; it's a generic catch-all term
496
+ * meaning "The start tag for this node did not appear in the document
497
+ * source".
360
498
  */
361
499
  GUMBO_INSERTION_BY_PARSER = 1 << 0,
362
500
 
363
501
  /**
364
- * A flag indicating that the end tag for this node did not appear in the
365
- * document source. Note that in some cases, you can still have
366
- * parser-inserted nodes with an explicit end tag: for example, "Text</html>"
367
- * has GUMBO_INSERTED_BY_PARSER set on the <html> node, but
368
- * GUMBO_INSERTED_END_TAG_IMPLICITLY is unset, as the </html> tag actually
369
- * exists. This flag will be set only if the end tag is completely missing;
370
- * in some cases, the end tag may be misplaced (eg. a </body> tag with text
371
- * afterwards), which will leave this flag unset and require clients to
372
- * inspect the parse errors for that case.
502
+ * A flag indicating that the end tag for this node did not appear in
503
+ * the document source. Note that in some cases, you can still have
504
+ * parser-inserted nodes with an explicit end tag. For example,
505
+ * `Text</html>` has `GUMBO_INSERTION_BY_PARSER` set on the `<html>`
506
+ * node, but `GUMBO_INSERTION_IMPLICIT_END_TAG` is unset, as the
507
+ * `</html>` tag actually exists.
508
+ *
509
+ * This flag will be set only if the end tag is completely missing.
510
+ * In some cases, the end tag may be misplaced (e.g. a `</body>` tag
511
+ * with text afterwards), which will leave this flag unset and require
512
+ * clients to inspect the parse errors for that case.
373
513
  */
374
514
  GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
375
515
 
376
516
  // Value 1 << 2 was for a flag that has since been removed.
377
517
 
378
518
  /**
379
- * A flag for nodes that are inserted because their presence is implied by
380
- * other tags, eg. <html>, <head>, <body>, <tbody>, etc.
519
+ * A flag for nodes that are inserted because their presence is
520
+ * implied by other tags, e.g. `<html>`, `<head>`, `<body>`,
521
+ * `<tbody>`, etc.
381
522
  */
382
523
  GUMBO_INSERTION_IMPLIED = 1 << 3,
383
524
 
384
525
  /**
385
- * A flag for nodes that are converted from their end tag equivalents. For
386
- * example, </p> when no paragraph is open implies that the parser should
387
- * create a <p> tag and immediately close it, while </br> means the same thing
388
- * as <br>.
526
+ * A flag for nodes that are converted from their end tag equivalents.
527
+ * For example, `</p>` when no paragraph is open implies that the
528
+ * parser should create a `<p>` tag and immediately close it, while
529
+ * `</br>` means the same thing as `<br>`.
389
530
  */
390
531
  GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
391
532
 
392
- /** A flag for nodes that are converted from the parse of an <isindex> tag. */
393
- GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
533
+ // Value 1 << 5 was for a flag that has since been removed.
394
534
 
395
- /** A flag for <image> tags that are rewritten as <img>. */
535
+ /** A flag for `<image>` tags that are rewritten as `<img>`. */
396
536
  GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
397
537
 
398
538
  /**
399
- * A flag for nodes that are cloned as a result of the reconstruction of
400
- * active formatting elements. This is set only on the clone; the initial
401
- * portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
539
+ * A flag for nodes that are cloned as a result of the reconstruction
540
+ * of active formatting elements. This is set only on the clone; the
541
+ * initial portion of the formatting run is a NORMAL node with an
542
+ * `IMPLICIT_END_TAG`.
402
543
  */
403
544
  GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
404
545
 
@@ -415,18 +556,19 @@ typedef enum {
415
556
  GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
416
557
  } GumboParseFlags;
417
558
 
418
- /**
419
- * Information specific to document nodes.
420
- */
559
+ /** Information specific to document nodes. */
421
560
  typedef struct {
422
561
  /**
423
- * An array of GumboNodes, containing the children of this element. This will
424
- * normally consist of the <html> element and any comment nodes found.
425
- * Pointers are owned.
562
+ * An array of `GumboNode`s, containing the children of this element.
563
+ * This will normally consist of the `<html>` element and any comment
564
+ * nodes found. Pointers are owned.
426
565
  */
427
566
  GumboVector /* GumboNode* */ children;
428
567
 
429
- // True if there was an explicit doctype token as opposed to it being omitted.
568
+ /**
569
+ * `true` if there was an explicit doctype token, as opposed to it
570
+ * being omitted.
571
+ */
430
572
  bool has_doctype;
431
573
 
432
574
  // Fields from the doctype token, copied verbatim.
@@ -435,65 +577,70 @@ typedef struct {
435
577
  const char* system_identifier;
436
578
 
437
579
  /**
438
- * Whether or not the document is in QuirksMode, as determined by the values
439
- * in the GumboTokenDocType template.
580
+ * Whether or not the document is in QuirksMode, as determined by the
581
+ * values in the GumboTokenDocType template.
440
582
  */
441
583
  GumboQuirksModeEnum doc_type_quirks_mode;
442
584
  } GumboDocument;
443
585
 
444
586
  /**
445
- * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
446
- * This contains just a block of text and its position.
587
+ * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
588
+ * elements. This contains just a block of text and its position.
447
589
  */
448
590
  typedef struct {
449
591
  /**
450
- * The text of this node, after entities have been parsed and decoded. For
451
- * comment/cdata nodes, this does not include the comment delimiters.
592
+ * The text of this node, after entities have been parsed and decoded.
593
+ * For comment and cdata nodes, this does not include the comment
594
+ * delimiters.
452
595
  */
453
596
  const char* text;
454
597
 
455
598
  /**
456
- * The original text of this node, as a pointer into the original buffer. For
457
- * comment/cdata nodes, this includes the comment delimiters.
599
+ * The original text of this node, as a pointer into the original
600
+ * buffer. For comment/cdata nodes, this includes the comment
601
+ * delimiters.
458
602
  */
459
603
  GumboStringPiece original_text;
460
604
 
461
605
  /**
462
- * The starting position of this node. This corresponds to the position of
463
- * original_text, before entities are decoded.
606
+ * The starting position of this node. This corresponds to the
607
+ * position of `original_text`, before entities are decoded.
464
608
  * */
465
609
  GumboSourcePosition start_pos;
466
610
  } GumboText;
467
611
 
468
612
  /**
469
- * The struct used to represent all HTML elements. This contains information
470
- * about the tag, attributes, and child nodes.
613
+ * The struct used to represent all HTML elements. This contains
614
+ * information about the tag, attributes, and child nodes.
471
615
  */
472
616
  typedef struct {
473
617
  /**
474
- * An array of GumboNodes, containing the children of this element. Pointers
475
- * are owned.
618
+ * An array of `GumboNode`s, containing the children of this element.
619
+ * Pointers are owned.
476
620
  */
477
621
  GumboVector /* GumboNode* */ children;
478
622
 
479
623
  /** The GumboTag enum for this element. */
480
624
  GumboTag tag;
481
625
 
626
+ /** The name for this element. */
627
+ const char* name;
628
+
482
629
  /** The GumboNamespaceEnum for this element. */
483
630
  GumboNamespaceEnum tag_namespace;
484
631
 
485
632
  /**
486
- * A GumboStringPiece pointing to the original tag text for this element,
487
- * pointing directly into the source buffer. If the tag was inserted
488
- * algorithmically (for example, <head> or <tbody> insertion), this will be a
489
- * zero-length string.
633
+ * A `GumboStringPiece` pointing to the original tag text for this
634
+ * element, pointing directly into the source buffer. If the tag was
635
+ * inserted algorithmically (for example, `<head>` or `<tbody>`
636
+ * insertion), this will be a zero-length string.
490
637
  */
491
638
  GumboStringPiece original_tag;
492
639
 
493
640
  /**
494
- * A GumboStringPiece pointing to the original end tag text for this element.
495
- * If the end tag was inserted algorithmically, (for example, closing a
496
- * self-closing tag), this will be a zero-length string.
641
+ * A `GumboStringPiece` pointing to the original end tag text for this
642
+ * element. If the end tag was inserted algorithmically, (for example,
643
+ * closing a self-closing tag), this will be a zero-length string.
497
644
  */
498
645
  GumboStringPiece original_end_tag;
499
646
 
@@ -504,30 +651,31 @@ typedef struct {
504
651
  GumboSourcePosition end_pos;
505
652
 
506
653
  /**
507
- * An array of GumboAttributes, containing the attributes for this tag in the
508
- * order that they were parsed. Pointers are owned.
654
+ * An array of `GumboAttribute`s, containing the attributes for this
655
+ * tag in the order that they were parsed. Pointers are owned.
509
656
  */
510
657
  GumboVector /* GumboAttribute* */ attributes;
511
658
  } GumboElement;
512
659
 
513
660
  /**
514
- * A supertype for GumboElement and GumboText, so that we can include one
515
- * generic type in lists of children and cast as necessary to subtypes.
661
+ * A supertype for `GumboElement` and `GumboText`, so that we can
662
+ * include one generic type in lists of children and cast as necessary
663
+ * to subtypes.
516
664
  */
517
665
  struct GumboInternalNode {
518
666
  /** The type of node that this is. */
519
667
  GumboNodeType type;
520
668
 
521
- /** Pointer back to parent node. Not owned. */
669
+ /** Pointer back to parent node. Not owned. */
522
670
  GumboNode* parent;
523
671
 
524
672
  /** The index within the parent's children vector of this node. */
525
- size_t index_within_parent;
673
+ unsigned int index_within_parent;
526
674
 
527
675
  /**
528
- * A bitvector of flags containing information about why this element was
529
- * inserted into the parse tree, including a variety of special parse
530
- * situations.
676
+ * A bitvector of flags containing information about why this element
677
+ * was inserted into the parse tree, including a variety of special
678
+ * parse situations.
531
679
  */
532
680
  GumboParseFlags parse_flags;
533
681
 
@@ -539,133 +687,187 @@ struct GumboInternalNode {
539
687
  } v;
540
688
  };
541
689
 
542
- /**
543
- * The type for an allocator function. Takes the 'userdata' member of the
544
- * GumboParser struct as its first argument. Semantics should be the same as
545
- * malloc, i.e. return a block of size_t bytes on success or NULL on failure.
546
- * Allocating a block of 0 bytes behaves as per malloc.
547
- */
548
- // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
549
- typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
550
-
551
- /**
552
- * The type for a deallocator function. Takes the 'userdata' member of the
553
- * GumboParser struct as its first argument.
554
- */
555
- typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
556
-
557
690
  /**
558
691
  * Input struct containing configuration options for the parser.
559
- * These let you specify alternate memory managers, provide different error
560
- * handling, etc.
561
- * Use kGumboDefaultOptions for sensible defaults, and only set what you need.
692
+ * These let you configure parse limits, fragment parsing, and
693
+ * error handling, etc. Use `kGumboDefaultOptions` for sensible
694
+ * defaults and only set what you need.
562
695
  */
563
696
  typedef struct GumboInternalOptions {
564
- /** A memory allocator function. Default: malloc. */
565
- GumboAllocatorFunction allocator;
566
-
567
- /** A memory deallocator function. Default: free. */
568
- GumboDeallocatorFunction deallocator;
569
-
570
697
  /**
571
- * An opaque object that's passed in as the first argument to all callbacks
572
- * used by this library. Default: NULL.
573
- */
574
- void* userdata;
575
-
576
- /**
577
- * The tab-stop size, for computing positions in source code that uses tabs.
578
- * Default: 8.
698
+ * The tab-stop size, for computing positions in HTML files that
699
+ * use tabs. Default: `8`.
579
700
  */
580
701
  int tab_stop;
581
702
 
582
703
  /**
583
704
  * Whether or not to stop parsing when the first error is encountered.
584
- * Default: false.
705
+ * Default: `false`.
585
706
  */
586
707
  bool stop_on_first_error;
587
708
 
588
709
  /**
589
- * The maximum number of errors before the parser stops recording them. This
590
- * is provided so that if the page is totally borked, we don't completely fill
591
- * up the errors vector and exhaust memory with useless redundant errors. Set
592
- * to -1 to disable the limit.
593
- * Default: -1
710
+ * Maximum allowed depth for the parse tree. If this limit is exceeded,
711
+ * the parser will return early with a partial document and the returned
712
+ * `GumboOutput` will have its `status` field set to
713
+ * `GUMBO_STATUS_TREE_TOO_DEEP`.
714
+ * Default: `400`.
715
+ */
716
+ unsigned int max_tree_depth;
717
+
718
+ /**
719
+ * The maximum number of errors before the parser stops recording
720
+ * them. This is provided so that if the page is totally borked, we
721
+ * don't completely fill up the errors vector and exhaust memory with
722
+ * useless redundant errors. Set to `-1` to disable the limit.
723
+ * Default: `-1`.
594
724
  */
595
725
  int max_errors;
596
726
 
597
727
  /**
598
728
  * The fragment context for parsing:
599
- * https://html.spec.whatwg.org/multipage/syntax.html#parsing-html-fragments
729
+ * https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
600
730
  *
601
- * If GUMBO_TAG_LAST is passed here, it is assumed to be "no fragment", i.e.
602
- * the regular parsing algorithm. Otherwise, pass the tag enum for the
603
- * intended parent of the parsed fragment. We use just the tag enum rather
604
- * than a full node because that's enough to set all the parsing context we
605
- * need, and it provides some additional flexibility for client code to act as
606
- * if parsing a fragment even when a full HTML tree isn't available.
731
+ * If `NULL` is passed here, it is assumed to be "no
732
+ * fragment", i.e. the regular parsing algorithm. Otherwise, pass the
733
+ * tag name for the intended parent of the parsed fragment. We use the
734
+ * tag name, namespace, and encoding attribute which are sufficient to
735
+ * set all of the parsing context needed for fragment parsing.
607
736
  *
608
- * Default: GUMBO_TAG_LAST
737
+ * Default: `NULL`.
609
738
  */
610
- GumboTag fragment_context;
739
+ const char* fragment_context;
611
740
 
612
741
  /**
613
- * The namespace for the fragment context. This lets client code
614
- * differentiate between, say, parsing a <title> tag in SVG vs. parsing it in
615
- * HTML.
616
- * Default: GUMBO_NAMESPACE_HTML
742
+ * The namespace for the fragment context. This lets client code
743
+ * differentiate between, say, parsing a `<title>` tag in SVG vs.
744
+ * parsing it in HTML.
745
+ *
746
+ * Default: `GUMBO_NAMESPACE_HTML`.
617
747
  */
618
748
  GumboNamespaceEnum fragment_namespace;
749
+
750
+ /**
751
+ * The value of the fragment context's `encoding` attribute, if any.
752
+ * Set to `NULL` for no `encoding` attribute.
753
+ *
754
+ * Default: `NULL`.
755
+ */
756
+ const char* fragment_encoding;
757
+
758
+ /**
759
+ * Quirks mode for fragment parsing. The quirks mode for a given DOCTYPE can
760
+ * be looked up using `gumbo_compute_quirks_mode()`.
761
+ *
762
+ * Default: `GUMBO_DOCTYPE_NO_QUIRKS`.
763
+ */
764
+ GumboQuirksModeEnum quirks_mode;
765
+
766
+ /**
767
+ * For fragment parsing. Set this to true if the context node has a form
768
+ * element as an ancestor.
769
+ *
770
+ * Default: `false`.
771
+ */
772
+ bool fragment_context_has_form_ancestor;
619
773
  } GumboOptions;
620
774
 
621
775
  /** Default options struct; use this with gumbo_parse_with_options. */
622
776
  extern const GumboOptions kGumboDefaultOptions;
623
777
 
778
+ /**
779
+ * Status code indicating whether parsing finished successfully or
780
+ * was stopped mid-document due to exceptional circumstances.
781
+ */
782
+ typedef enum {
783
+ /**
784
+ * Indicates that parsing completed successfully. The resulting tree
785
+ * will be a complete document.
786
+ */
787
+ GUMBO_STATUS_OK,
788
+
789
+ /**
790
+ * Indicates that the maximum element nesting limit
791
+ * (`GumboOptions::max_tree_depth`) was reached during parsing. The
792
+ * resulting tree will be a partial document, with no further nodes
793
+ * created after the point where the limit was reached. The partial
794
+ * document may be useful for constructing an error message but
795
+ * typically shouldn't be used for other purposes.
796
+ */
797
+ GUMBO_STATUS_TREE_TOO_DEEP,
798
+
799
+ // Currently unused
800
+ GUMBO_STATUS_OUT_OF_MEMORY,
801
+ } GumboOutputStatus;
802
+
803
+
624
804
  /** The output struct containing the results of the parse. */
625
805
  typedef struct GumboInternalOutput {
626
806
  /**
627
- * Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT
628
- * that contains the entire document as its child.
807
+ * Pointer to the document node. This is a `GumboNode` of type
808
+ * `GUMBO_NODE_DOCUMENT` that contains the entire document as its child.
629
809
  */
630
810
  GumboNode* document;
631
811
 
632
812
  /**
633
- * Pointer to the root node. This the <html> tag that forms the root of the
634
- * document.
813
+ * Pointer to the root node. This is the `<html>` tag that forms the
814
+ * root of the document.
635
815
  */
636
816
  GumboNode* root;
637
817
 
638
818
  /**
639
819
  * A list of errors that occurred during the parse.
640
820
  * NOTE: In version 1.0 of this library, the API for errors hasn't been fully
641
- * fleshed out and may change in the future. For this reason, the GumboError
642
- * header isn't part of the public API. Contact us if you need errors
821
+ * fleshed out and may change in the future. For this reason, the GumboError
822
+ * header isn't part of the public API. Contact us if you need errors
643
823
  * reported so we can work out something appropriate for your use-case.
644
824
  */
645
825
  GumboVector /* GumboError */ errors;
826
+
827
+ /**
828
+ * A status code indicating whether parsing finished successfully or was
829
+ * stopped mid-document due to exceptional circumstances.
830
+ */
831
+ GumboOutputStatus status;
646
832
  } GumboOutput;
647
833
 
648
834
  /**
649
- * Parses a buffer of UTF8 text into an GumboNode parse tree. The buffer must
650
- * live at least as long as the parse tree, as some fields (eg. original_text)
651
- * point directly into the original buffer.
835
+ * Parses a buffer of UTF-8 text into a `GumboNode` parse tree. The
836
+ * buffer must live at least as long as the parse tree, as some fields
837
+ * (e.g. `original_text`) point directly into the original buffer.
652
838
  *
653
839
  * This doesn't support buffers longer than 4 gigabytes.
654
840
  */
655
841
  GumboOutput* gumbo_parse(const char* buffer);
656
842
 
657
843
  /**
658
- * Extended version of gumbo_parse that takes an explicit options structure,
659
- * buffer, and length.
844
+ * Extended version of `gumbo_parse` that takes an explicit options
845
+ * structure, buffer, and length.
660
846
  */
661
- GumboOutput* gumbo_parse_with_options(
662
- const GumboOptions* options, const char* buffer, size_t buffer_length);
847
+ GumboOutput* gumbo_parse_with_options (
848
+ const GumboOptions* options,
849
+ const char* buffer,
850
+ size_t buffer_length
851
+ );
852
+
853
+ /**
854
+ * Compute the quirks mode based on the name, public identifier, and system
855
+ * identifier. Any of these may be `NULL` to indicate a missing value.
856
+ */
857
+ GumboQuirksModeEnum gumbo_compute_quirks_mode (
858
+ const char *name,
859
+ const char *pubid,
860
+ const char *sysid
861
+ );
862
+
863
+ /** Convert a `GumboOutputStatus` code into a readable description. */
864
+ const char* gumbo_status_to_string(GumboOutputStatus status);
663
865
 
664
- /** Release the memory used for the parse tree & parse errors. */
665
- void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output);
866
+ /** Release the memory used for the parse tree and parse errors. */
867
+ void gumbo_destroy_output(GumboOutput* output);
666
868
 
667
869
  #ifdef __cplusplus
668
870
  }
669
871
  #endif
670
872
 
671
- #endif // GUMBO_GUMBO_H_
873
+ #endif // GUMBO_H