nokogumbo 1.5.0 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +237 -26
- data/ext/nokogumbo/extconf.rb +121 -0
- data/ext/nokogumbo/nokogumbo.c +793 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +26 -28
- data/gumbo-parser/src/attribute.h +3 -23
- data/gumbo-parser/src/char_ref.c +5972 -6816
- data/gumbo-parser/src/char_ref.h +14 -45
- data/gumbo-parser/src/error.c +510 -163
- data/gumbo-parser/src/error.h +70 -147
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/gumbo.h +577 -305
- data/gumbo-parser/src/insertion_mode.h +4 -28
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +2922 -2228
- data/gumbo-parser/src/parser.h +6 -22
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +43 -50
- data/gumbo-parser/src/string_buffer.h +24 -40
- data/gumbo-parser/src/string_piece.c +39 -39
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/tag.c +186 -59
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +1 -25
- data/gumbo-parser/src/tokenizer.c +2127 -1561
- data/gumbo-parser/src/tokenizer.h +41 -52
- data/gumbo-parser/src/tokenizer_states.h +281 -45
- data/gumbo-parser/src/utf8.c +98 -123
- data/gumbo-parser/src/utf8.h +84 -52
- data/gumbo-parser/src/util.c +48 -38
- data/gumbo-parser/src/util.h +10 -40
- data/gumbo-parser/src/vector.c +45 -57
- data/gumbo-parser/src/vector.h +17 -39
- data/lib/nokogumbo.rb +11 -173
- data/lib/nokogumbo/html5.rb +252 -0
- data/lib/nokogumbo/html5/document.rb +53 -0
- data/lib/nokogumbo/html5/document_fragment.rb +62 -0
- data/lib/nokogumbo/html5/node.rb +72 -0
- data/lib/nokogumbo/version.rb +3 -0
- metadata +43 -24
- data/ext/nokogumboc/extconf.rb +0 -60
- data/ext/nokogumboc/nokogumbo.c +0 -295
- data/gumbo-parser/src/char_ref.rl +0 -2554
- data/gumbo-parser/src/string_piece.h +0 -38
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -153
- data/gumbo-parser/src/tag_gperf.h +0 -105
- data/gumbo-parser/src/tag_sizes.h +0 -4
- data/gumbo-parser/src/tag_strings.h +0 -153
- data/gumbo-parser/visualc/include/strings.h +0 -4
- data/test-nokogumbo.rb +0 -190
data/gumbo-parser/src/error.h
CHANGED
@@ -1,32 +1,13 @@
|
|
1
|
-
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
//
|
17
|
-
// Error types, enums, and handling functions.
|
18
|
-
|
19
1
|
#ifndef GUMBO_ERROR_H_
|
20
2
|
#define GUMBO_ERROR_H_
|
21
|
-
|
22
|
-
#define _CRT_SECURE_NO_WARNINGS
|
23
|
-
#endif
|
3
|
+
|
24
4
|
#include <stdint.h>
|
25
5
|
|
26
6
|
#include "gumbo.h"
|
27
7
|
#include "insertion_mode.h"
|
28
8
|
#include "string_buffer.h"
|
29
9
|
#include "token_type.h"
|
10
|
+
#include "tokenizer_states.h"
|
30
11
|
|
31
12
|
#ifdef __cplusplus
|
32
13
|
extern "C" {
|
@@ -35,84 +16,66 @@ extern "C" {
|
|
35
16
|
struct GumboInternalParser;
|
36
17
|
|
37
18
|
typedef enum {
|
19
|
+
// Defined errors.
|
20
|
+
// https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
|
21
|
+
GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT,
|
22
|
+
GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER,
|
23
|
+
GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER,
|
24
|
+
GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
|
25
|
+
GUMBO_ERR_CDATA_IN_HTML_CONTENT,
|
26
|
+
GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
|
27
|
+
GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM,
|
28
|
+
GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
|
29
|
+
GUMBO_ERR_END_TAG_WITH_ATTRIBUTES,
|
30
|
+
GUMBO_ERR_DUPLICATE_ATTRIBUTE,
|
31
|
+
GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS,
|
32
|
+
GUMBO_ERR_EOF_BEFORE_TAG_NAME,
|
33
|
+
GUMBO_ERR_EOF_IN_CDATA,
|
34
|
+
GUMBO_ERR_EOF_IN_COMMENT,
|
35
|
+
GUMBO_ERR_EOF_IN_DOCTYPE,
|
36
|
+
GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT,
|
37
|
+
GUMBO_ERR_EOF_IN_TAG,
|
38
|
+
GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT,
|
39
|
+
GUMBO_ERR_INCORRECTLY_OPENED_COMMENT,
|
40
|
+
GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME,
|
41
|
+
GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME,
|
42
|
+
GUMBO_ERR_MISSING_ATTRIBUTE_VALUE,
|
43
|
+
GUMBO_ERR_MISSING_DOCTYPE_NAME,
|
44
|
+
GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER,
|
45
|
+
GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER,
|
46
|
+
GUMBO_ERR_MISSING_END_TAG_NAME,
|
47
|
+
GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
|
48
|
+
GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
|
49
|
+
GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
|
50
|
+
GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
|
51
|
+
GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
|
52
|
+
GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME,
|
53
|
+
GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
|
54
|
+
GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
|
55
|
+
GUMBO_ERR_NESTED_COMMENT,
|
56
|
+
GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
|
57
|
+
GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM,
|
58
|
+
GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS,
|
59
|
+
GUMBO_ERR_NULL_CHARACTER_REFERENCE,
|
60
|
+
GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
|
61
|
+
GUMBO_ERR_SURROGATE_IN_INPUT_STREAM,
|
62
|
+
GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
|
63
|
+
GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME,
|
64
|
+
GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE,
|
65
|
+
GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME,
|
66
|
+
GUMBO_ERR_UNEXPECTED_NULL_CHARACTER,
|
67
|
+
GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME,
|
68
|
+
GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG,
|
69
|
+
GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE,
|
70
|
+
|
71
|
+
// Encoding errors.
|
38
72
|
GUMBO_ERR_UTF8_INVALID,
|
39
73
|
GUMBO_ERR_UTF8_TRUNCATED,
|
40
|
-
|
41
|
-
|
42
|
-
GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,
|
43
|
-
GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,
|
44
|
-
GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,
|
45
|
-
GUMBO_ERR_NAMED_CHAR_REF_INVALID,
|
46
|
-
GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
|
47
|
-
GUMBO_ERR_TAG_EOF,
|
48
|
-
GUMBO_ERR_TAG_INVALID,
|
49
|
-
GUMBO_ERR_CLOSE_TAG_EMPTY,
|
50
|
-
GUMBO_ERR_CLOSE_TAG_EOF,
|
51
|
-
GUMBO_ERR_CLOSE_TAG_INVALID,
|
52
|
-
GUMBO_ERR_SCRIPT_EOF,
|
53
|
-
GUMBO_ERR_ATTR_NAME_EOF,
|
54
|
-
GUMBO_ERR_ATTR_NAME_INVALID,
|
55
|
-
GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
|
56
|
-
GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
|
57
|
-
GUMBO_ERR_ATTR_UNQUOTED_EOF,
|
58
|
-
GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
|
59
|
-
GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
|
60
|
-
GUMBO_ERR_ATTR_AFTER_EOF,
|
61
|
-
GUMBO_ERR_ATTR_AFTER_INVALID,
|
62
|
-
GUMBO_ERR_DUPLICATE_ATTR,
|
63
|
-
GUMBO_ERR_SOLIDUS_EOF,
|
64
|
-
GUMBO_ERR_SOLIDUS_INVALID,
|
65
|
-
GUMBO_ERR_DASHES_OR_DOCTYPE,
|
66
|
-
GUMBO_ERR_COMMENT_EOF,
|
67
|
-
GUMBO_ERR_COMMENT_INVALID,
|
68
|
-
GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
|
69
|
-
GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
|
70
|
-
GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
|
71
|
-
GUMBO_ERR_COMMENT_END_BANG_EOF,
|
72
|
-
GUMBO_ERR_DOCTYPE_EOF,
|
73
|
-
GUMBO_ERR_DOCTYPE_INVALID,
|
74
|
-
GUMBO_ERR_DOCTYPE_SPACE,
|
75
|
-
GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
|
76
|
-
GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
|
77
|
-
GUMBO_ERR_DOCTYPE_END,
|
74
|
+
|
75
|
+
// Generic parser error.
|
78
76
|
GUMBO_ERR_PARSER,
|
79
|
-
GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,
|
80
77
|
} GumboErrorType;
|
81
78
|
|
82
|
-
// Additional data for duplicated attributes.
|
83
|
-
typedef struct GumboInternalDuplicateAttrError {
|
84
|
-
// The name of the attribute. Owned by this struct.
|
85
|
-
const char* name;
|
86
|
-
|
87
|
-
// The (0-based) index within the attributes vector of the original
|
88
|
-
// occurrence.
|
89
|
-
unsigned int original_index;
|
90
|
-
|
91
|
-
// The (0-based) index where the new occurrence would be.
|
92
|
-
unsigned int new_index;
|
93
|
-
} GumboDuplicateAttrError;
|
94
|
-
|
95
|
-
// A simplified representation of the tokenizer state, designed to be more
|
96
|
-
// useful to clients of this library than the internal representation. This
|
97
|
-
// condenses the actual states used in the tokenizer state machine into a few
|
98
|
-
// values that will be familiar to users of HTML.
|
99
|
-
typedef enum {
|
100
|
-
GUMBO_ERR_TOKENIZER_DATA,
|
101
|
-
GUMBO_ERR_TOKENIZER_CHAR_REF,
|
102
|
-
GUMBO_ERR_TOKENIZER_RCDATA,
|
103
|
-
GUMBO_ERR_TOKENIZER_RAWTEXT,
|
104
|
-
GUMBO_ERR_TOKENIZER_PLAINTEXT,
|
105
|
-
GUMBO_ERR_TOKENIZER_SCRIPT,
|
106
|
-
GUMBO_ERR_TOKENIZER_TAG,
|
107
|
-
GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
|
108
|
-
GUMBO_ERR_TOKENIZER_ATTR_NAME,
|
109
|
-
GUMBO_ERR_TOKENIZER_ATTR_VALUE,
|
110
|
-
GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
|
111
|
-
GUMBO_ERR_TOKENIZER_COMMENT,
|
112
|
-
GUMBO_ERR_TOKENIZER_DOCTYPE,
|
113
|
-
GUMBO_ERR_TOKENIZER_CDATA,
|
114
|
-
} GumboTokenizerErrorState;
|
115
|
-
|
116
79
|
// Additional data for tokenizer errors.
|
117
80
|
// This records the current state and codepoint encountered - this is usually
|
118
81
|
// enough to reconstruct what went wrong and provide a friendly error message.
|
@@ -121,7 +84,7 @@ typedef struct GumboInternalTokenizerError {
|
|
121
84
|
int codepoint;
|
122
85
|
|
123
86
|
// The state that the tokenizer was in at the time.
|
124
|
-
|
87
|
+
GumboTokenizerEnum state;
|
125
88
|
} GumboTokenizerError;
|
126
89
|
|
127
90
|
// Additional data for parse errors.
|
@@ -129,61 +92,43 @@ typedef struct GumboInternalParserError {
|
|
129
92
|
// The type of input token that resulted in this error.
|
130
93
|
GumboTokenType input_type;
|
131
94
|
|
132
|
-
// The HTML tag of the input token.
|
95
|
+
// The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
|
133
96
|
GumboTag input_tag;
|
134
97
|
|
135
98
|
// The insertion mode that the parser was in at the time.
|
136
99
|
GumboInsertionMode parser_state;
|
137
100
|
|
138
|
-
// The tag stack at the point of the error.
|
101
|
+
// The tag stack at the point of the error. Note that this is a GumboVector
|
139
102
|
// of GumboTags *stored by value* - cast the void* to a GumboTag directly to
|
140
103
|
// get at the tag.
|
141
104
|
GumboVector /* GumboTag */ tag_stack;
|
142
105
|
} GumboParserError;
|
143
106
|
|
144
107
|
// The overall error struct representing an error in decoding/tokenizing/parsing
|
145
|
-
// the HTML.
|
108
|
+
// the HTML. This contains an enumerated type flag, a source position, and then
|
146
109
|
// a union of fields containing data specific to the error.
|
147
|
-
|
110
|
+
struct GumboInternalError {
|
148
111
|
// The type of error.
|
149
112
|
GumboErrorType type;
|
150
113
|
|
151
114
|
// The position within the source file where the error occurred.
|
152
115
|
GumboSourcePosition position;
|
153
116
|
|
154
|
-
//
|
155
|
-
|
156
|
-
// character-based instead of byte-based offsets).
|
157
|
-
const char* original_text;
|
117
|
+
// The piece of text that caused the error.
|
118
|
+
GumboStringPiece original_text;
|
158
119
|
|
159
120
|
// Type-specific error information.
|
160
121
|
union {
|
161
|
-
// The code point we encountered, for:
|
162
|
-
// * GUMBO_ERR_UTF8_INVALID
|
163
|
-
// * GUMBO_ERR_UTF8_TRUNCATED
|
164
|
-
// * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
|
165
|
-
// * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
|
166
|
-
uint64_t codepoint;
|
167
|
-
|
168
122
|
// Tokenizer errors.
|
169
123
|
GumboTokenizerError tokenizer;
|
170
124
|
|
171
|
-
//
|
172
|
-
|
173
|
-
// * GUMBO_ERR_NAMED_CHAR_REF_INVALID
|
174
|
-
GumboStringPiece text;
|
175
|
-
|
176
|
-
// Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
|
177
|
-
GumboDuplicateAttrError duplicate_attr;
|
178
|
-
|
179
|
-
// Parser state, for GUMBO_ERR_PARSER and
|
180
|
-
// GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
|
181
|
-
struct GumboInternalParserError parser;
|
125
|
+
// Parser errors.
|
126
|
+
GumboParserError parser;
|
182
127
|
} v;
|
183
|
-
}
|
128
|
+
};
|
184
129
|
|
185
130
|
// Adds a new error to the parser's error list, and returns a pointer to it so
|
186
|
-
// that clients can fill out the rest of its fields.
|
131
|
+
// that clients can fill out the rest of its fields. May return NULL if we're
|
187
132
|
// already over the max_errors field specified in GumboOptions.
|
188
133
|
GumboError* gumbo_add_error(struct GumboInternalParser* parser);
|
189
134
|
|
@@ -194,32 +139,10 @@ void gumbo_init_errors(struct GumboInternalParser* errors);
|
|
194
139
|
void gumbo_destroy_errors(struct GumboInternalParser* errors);
|
195
140
|
|
196
141
|
// Frees the memory used for a single GumboError.
|
197
|
-
void gumbo_error_destroy(
|
198
|
-
|
199
|
-
// Prints an error to a string. This fills an empty GumboStringBuffer with a
|
200
|
-
// freshly-allocated buffer containing the error message text. The caller is
|
201
|
-
// responsible for deleting the buffer. (Note that the buffer is allocated with
|
202
|
-
// the allocator specified in the GumboParser config and hence should be freed
|
203
|
-
// by gumbo_parser_deallocate().)
|
204
|
-
void gumbo_error_to_string(struct GumboInternalParser* parser,
|
205
|
-
const GumboError* error, GumboStringBuffer* output);
|
206
|
-
|
207
|
-
// Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
|
208
|
-
// with a freshly-allocated buffer containing the error message text. The
|
209
|
-
// caller is responsible for deleting the buffer. (Note that the buffer is
|
210
|
-
// allocated with the allocator specified in the GumboParser config and hence
|
211
|
-
// should be freed by gumbo_parser_deallocate().)
|
212
|
-
void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser,
|
213
|
-
const GumboError* error, const char* source_text,
|
214
|
-
GumboStringBuffer* output);
|
215
|
-
|
216
|
-
// Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
|
217
|
-
// of writing to a string.
|
218
|
-
void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser,
|
219
|
-
const GumboError* error, const char* source_text);
|
142
|
+
void gumbo_error_destroy(GumboError* error);
|
220
143
|
|
221
144
|
#ifdef __cplusplus
|
222
145
|
}
|
223
146
|
#endif
|
224
147
|
|
225
|
-
#endif
|
148
|
+
#endif // GUMBO_ERROR_H_
|
@@ -0,0 +1,104 @@
|
|
1
|
+
/* ANSI-C code produced by gperf version 3.1 */
|
2
|
+
/* Command-line: gperf -m100 -n lib/foreign_attrs.gperf */
|
3
|
+
/* Computed positions: -k'2,8' */
|
4
|
+
/* Filtered by: mk/gperf-filter.sed */
|
5
|
+
|
6
|
+
#include "replacement.h"
|
7
|
+
#include "macros.h"
|
8
|
+
#include <string.h>
|
9
|
+
|
10
|
+
#define TOTAL_KEYWORDS 11
|
11
|
+
#define MIN_WORD_LENGTH 5
|
12
|
+
#define MAX_WORD_LENGTH 13
|
13
|
+
#define MIN_HASH_VALUE 0
|
14
|
+
#define MAX_HASH_VALUE 10
|
15
|
+
/* maximum key range = 11, duplicates = 0 */
|
16
|
+
|
17
|
+
static inline unsigned int
|
18
|
+
hash (register const char *str, register size_t len)
|
19
|
+
{
|
20
|
+
static const unsigned char asso_values[] =
|
21
|
+
{
|
22
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
23
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
24
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
25
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
26
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
27
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
28
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
29
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
30
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
31
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 2,
|
32
|
+
11, 10, 11, 9, 7, 6, 11, 11, 1, 0,
|
33
|
+
11, 5, 11, 11, 4, 11, 11, 11, 11, 11,
|
34
|
+
11, 3, 11, 11, 11, 11, 11, 11, 11, 11,
|
35
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
36
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
37
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
38
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
39
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
40
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
41
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
42
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
43
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
44
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
45
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
46
|
+
11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
|
47
|
+
11, 11, 11, 11, 11, 11
|
48
|
+
};
|
49
|
+
register unsigned int hval = 0;
|
50
|
+
|
51
|
+
switch (len)
|
52
|
+
{
|
53
|
+
default:
|
54
|
+
hval += asso_values[(unsigned char)str[7]];
|
55
|
+
/*FALLTHROUGH*/
|
56
|
+
case 7:
|
57
|
+
case 6:
|
58
|
+
case 5:
|
59
|
+
case 4:
|
60
|
+
case 3:
|
61
|
+
case 2:
|
62
|
+
hval += asso_values[(unsigned char)str[1]];
|
63
|
+
break;
|
64
|
+
}
|
65
|
+
return hval;
|
66
|
+
}
|
67
|
+
|
68
|
+
const ForeignAttrReplacement *
|
69
|
+
gumbo_get_foreign_attr_replacement (register const char *str, register size_t len)
|
70
|
+
{
|
71
|
+
static const unsigned char lengthtable[] =
|
72
|
+
{
|
73
|
+
5, 11, 9, 13, 10, 10, 10, 11, 10, 8, 8
|
74
|
+
};
|
75
|
+
static const ForeignAttrReplacement wordlist[] =
|
76
|
+
{
|
77
|
+
{"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
|
78
|
+
{"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
|
79
|
+
{"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
|
80
|
+
{"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
|
81
|
+
{"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
|
82
|
+
{"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
|
83
|
+
{"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
|
84
|
+
{"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
|
85
|
+
{"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
|
86
|
+
{"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
|
87
|
+
{"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML}
|
88
|
+
};
|
89
|
+
|
90
|
+
if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
|
91
|
+
{
|
92
|
+
register unsigned int key = hash (str, len);
|
93
|
+
|
94
|
+
if (key <= MAX_HASH_VALUE)
|
95
|
+
if (len == lengthtable[key])
|
96
|
+
{
|
97
|
+
register const char *s = wordlist[key].from;
|
98
|
+
|
99
|
+
if (s && *str == *s && !memcmp (str + 1, s + 1, len - 1))
|
100
|
+
return &wordlist[key];
|
101
|
+
}
|
102
|
+
}
|
103
|
+
return 0;
|
104
|
+
}
|
data/gumbo-parser/src/gumbo.h
CHANGED
@@ -1,51 +1,33 @@
|
|
1
|
-
// Copyright 2010 Google Inc.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License,
|
4
|
-
|
5
|
-
//
|
6
|
-
//
|
7
|
-
//
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
//
|
17
|
-
// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
|
18
|
-
// GUMBO_ as a prefix for enum constants (static constants get the Google-style
|
19
|
-
// kGumbo prefix).
|
1
|
+
// Copyright 2010 Google Inc.
|
2
|
+
// Copyright 2018 Craig Barnes.
|
3
|
+
// Licensed under the Apache License, version 2.0.
|
4
|
+
|
5
|
+
// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions,
|
6
|
+
// GUMBO_ as a prefix for enum constants and kGumbo as a prefix for
|
7
|
+
// static constants.
|
20
8
|
|
21
9
|
/**
|
22
10
|
* @file
|
23
11
|
* @mainpage Gumbo HTML Parser
|
24
12
|
*
|
25
|
-
* This provides a conformant, no-dependencies implementation of the
|
26
|
-
* parsing algorithm.
|
27
|
-
* encoding, run a preprocessing step to convert
|
28
|
-
* tree made of the structs in this file.
|
13
|
+
* This provides a conformant, no-dependencies implementation of the
|
14
|
+
* [HTML5] parsing algorithm. It supports only UTF-8 -- if you need
|
15
|
+
* to parse a different encoding, run a preprocessing step to convert
|
16
|
+
* to UTF-8. It returns a parse tree made of the structs in this file.
|
29
17
|
*
|
30
18
|
* Example:
|
31
19
|
* @code
|
32
20
|
* GumboOutput* output = gumbo_parse(input);
|
33
21
|
* do_something_with_doctype(output->document);
|
34
22
|
* do_something_with_html_tree(output->root);
|
35
|
-
* gumbo_destroy_output(
|
23
|
+
* gumbo_destroy_output(output);
|
36
24
|
* @endcode
|
37
|
-
* HTML5 Spec:
|
38
25
|
*
|
39
|
-
*
|
26
|
+
* [HTML5]: https://html.spec.whatwg.org/multipage/
|
40
27
|
*/
|
41
28
|
|
42
|
-
#ifndef
|
43
|
-
#define
|
44
|
-
|
45
|
-
#ifdef _MSC_VER
|
46
|
-
#define _CRT_SECURE_NO_WARNINGS
|
47
|
-
#define fileno _fileno
|
48
|
-
#endif
|
29
|
+
#ifndef GUMBO_H
|
30
|
+
#define GUMBO_H
|
49
31
|
|
50
32
|
#include <stdbool.h>
|
51
33
|
#include <stddef.h>
|
@@ -55,73 +37,77 @@ extern "C" {
|
|
55
37
|
#endif
|
56
38
|
|
57
39
|
/**
|
58
|
-
* A struct representing a character position within the original text
|
59
|
-
* Line and column numbers are 1-based and offsets are 0-based,
|
60
|
-
* how most editors and command-line tools work.
|
61
|
-
* positions in terms of characters while offsets measure by bytes; this is
|
62
|
-
* because the offset field is often used to pull out a particular region of
|
63
|
-
* text (which in most languages that bind to C implies pointer arithmetic on a
|
64
|
-
* buffer of bytes), while the column field is often used to reference a
|
65
|
-
* particular column on a printable display, which nowadays is usually UTF-8.
|
40
|
+
* A struct representing a character position within the original text
|
41
|
+
* buffer. Line and column numbers are 1-based and offsets are 0-based,
|
42
|
+
* which matches how most editors and command-line tools work.
|
66
43
|
*/
|
67
44
|
typedef struct {
|
68
|
-
|
69
|
-
|
70
|
-
|
45
|
+
size_t line;
|
46
|
+
size_t column;
|
47
|
+
size_t offset;
|
71
48
|
} GumboSourcePosition;
|
72
49
|
|
73
50
|
/**
|
74
|
-
* A
|
75
|
-
* parser
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
*
|
81
|
-
* parser are represented by a char* and a length; the char* points into
|
82
|
-
* an existing data buffer owned by some other code (often the original input).
|
83
|
-
* GumboStringPieces are assumed (by convention) to be immutable, because they
|
84
|
-
* may share data. Use GumboStringBuffer if you need to construct a string.
|
85
|
-
* Clients should assume that it is not NUL-terminated, and should always use
|
86
|
-
* explicit lengths when manipulating them.
|
51
|
+
* A struct representing a string or part of a string. Strings within
|
52
|
+
* the parser are represented by a `char*` and a length; the `char*`
|
53
|
+
* points into an existing data buffer owned by some other code (often
|
54
|
+
* the original input). `GumboStringPiece`s are assumed (by convention)
|
55
|
+
* to be immutable, because they may share data. Clients should assume
|
56
|
+
* that it is not NUL-terminated and should always use explicit lengths
|
57
|
+
* when manipulating them.
|
87
58
|
*/
|
88
59
|
typedef struct {
|
89
|
-
/** A pointer to the beginning of the string.
|
60
|
+
/** A pointer to the beginning of the string. `NULL` if `length == 0`. */
|
90
61
|
const char* data;
|
91
62
|
|
92
|
-
/** The length of the string fragment, in bytes
|
63
|
+
/** The length of the string fragment, in bytes (may be zero). */
|
93
64
|
size_t length;
|
94
65
|
} GumboStringPiece;
|
95
66
|
|
67
|
+
#define GUMBO_EMPTY_STRING_INIT { .data = NULL, .length = 0 }
|
96
68
|
/** A constant to represent a 0-length null string. */
|
97
|
-
|
69
|
+
#define kGumboEmptyString (const GumboStringPiece)GUMBO_EMPTY_STRING_INIT
|
98
70
|
|
99
71
|
/**
|
100
|
-
* Compares two
|
101
|
-
* otherwise.
|
72
|
+
* Compares two `GumboStringPiece`s, and returns `true` if they're
|
73
|
+
* equal or `false` otherwise.
|
102
74
|
*/
|
103
|
-
bool gumbo_string_equals(
|
104
|
-
|
75
|
+
bool gumbo_string_equals (
|
76
|
+
const GumboStringPiece* str1,
|
77
|
+
const GumboStringPiece* str2
|
78
|
+
);
|
105
79
|
|
106
80
|
/**
|
107
|
-
* Compares two
|
108
|
-
* equal or false otherwise.
|
81
|
+
* Compares two `GumboStringPiece`s, ignoring case, and returns `true`
|
82
|
+
* if they're equal or `false` otherwise.
|
109
83
|
*/
|
110
|
-
bool gumbo_string_equals_ignore_case(
|
111
|
-
|
84
|
+
bool gumbo_string_equals_ignore_case (
|
85
|
+
const GumboStringPiece* str1,
|
86
|
+
const GumboStringPiece* str2
|
87
|
+
);
|
112
88
|
|
113
89
|
/**
|
114
|
-
*
|
115
|
-
*
|
116
|
-
|
117
|
-
|
118
|
-
*
|
119
|
-
*
|
120
|
-
|
90
|
+
* Checks if the first `GumboStringPiece` is a prefix of the second, ignoring
|
91
|
+
* case.
|
92
|
+
*/
|
93
|
+
bool gumbo_string_prefix_ignore_case (
|
94
|
+
const GumboStringPiece* prefix,
|
95
|
+
const GumboStringPiece* str
|
96
|
+
);
|
97
|
+
|
98
|
+
/**
|
99
|
+
* A simple vector implementation. This stores a pointer to a data array
|
100
|
+
* and a length. All elements are stored as `void*`; client code must
|
101
|
+
* cast to the appropriate type. Overflows upon addition result in
|
102
|
+
* reallocation of the data array, with the size doubling to maintain
|
103
|
+
* `O(1)` amortized cost. There is no removal function, as this isn't
|
104
|
+
* needed for any of the operations within this library. Iteration can
|
105
|
+
* be done through inspecting the structure directly in a `for` loop.
|
121
106
|
*/
|
122
107
|
typedef struct {
|
123
|
-
/**
|
124
|
-
* elements
|
108
|
+
/**
|
109
|
+
* Data elements. This points to a dynamically-allocated array of
|
110
|
+
* `capacity` elements, each a `void*` to the element itself.
|
125
111
|
*/
|
126
112
|
void** data;
|
127
113
|
|
@@ -132,82 +118,230 @@ typedef struct {
|
|
132
118
|
unsigned int capacity;
|
133
119
|
} GumboVector;
|
134
120
|
|
135
|
-
|
136
|
-
|
121
|
+
# define GUMBO_EMPTY_VECTOR_INIT { .data = NULL, .length = 0, .capacity = 0 }
|
122
|
+
/** An empty (0-length, 0-capacity) `GumboVector`. */
|
123
|
+
#define kGumboEmptyVector (const GumboVector)GUMBO_EMPTY_VECTOR_INIT
|
137
124
|
|
138
125
|
/**
|
139
|
-
* Returns the first index at which an element appears in this vector
|
140
|
-
* by pointer equality), or
|
126
|
+
* Returns the first index at which an element appears in this vector
|
127
|
+
* (testing by pointer equality), or `-1` if it never does.
|
141
128
|
*/
|
142
129
|
int gumbo_vector_index_of(GumboVector* vector, const void* element);
|
143
130
|
|
144
131
|
/**
|
145
|
-
* An enum for all the tags defined in the HTML5 standard.
|
146
|
-
* the tag names themselves.
|
147
|
-
* the spec itself (or for tags with special
|
148
|
-
*
|
149
|
-
* name can be obtained
|
132
|
+
* An `enum` for all the tags defined in the HTML5 standard. These
|
133
|
+
* correspond to the tag names themselves. Enum constants exist only
|
134
|
+
* for tags that appear in the spec itself (or for tags with special
|
135
|
+
* handling in the SVG and MathML namespaces). Any other tags appear
|
136
|
+
* as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained
|
137
|
+
* through `original_tag`.
|
150
138
|
*
|
151
|
-
* This is mostly for API convenience, so that clients of this library
|
152
|
-
* need to perform a strcasecmp to find the normalized tag
|
153
|
-
* efficiency benefits, by letting the parser work
|
154
|
-
* strings.
|
139
|
+
* This is mostly for API convenience, so that clients of this library
|
140
|
+
* don't need to perform a `strcasecmp` to find the normalized tag
|
141
|
+
* name. It also has efficiency benefits, by letting the parser work
|
142
|
+
* with enums instead of strings.
|
155
143
|
*/
|
156
144
|
typedef enum {
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
145
|
+
GUMBO_TAG_HTML,
|
146
|
+
GUMBO_TAG_HEAD,
|
147
|
+
GUMBO_TAG_TITLE,
|
148
|
+
GUMBO_TAG_BASE,
|
149
|
+
GUMBO_TAG_LINK,
|
150
|
+
GUMBO_TAG_META,
|
151
|
+
GUMBO_TAG_STYLE,
|
152
|
+
GUMBO_TAG_SCRIPT,
|
153
|
+
GUMBO_TAG_NOSCRIPT,
|
154
|
+
GUMBO_TAG_TEMPLATE,
|
155
|
+
GUMBO_TAG_BODY,
|
156
|
+
GUMBO_TAG_ARTICLE,
|
157
|
+
GUMBO_TAG_SECTION,
|
158
|
+
GUMBO_TAG_NAV,
|
159
|
+
GUMBO_TAG_ASIDE,
|
160
|
+
GUMBO_TAG_H1,
|
161
|
+
GUMBO_TAG_H2,
|
162
|
+
GUMBO_TAG_H3,
|
163
|
+
GUMBO_TAG_H4,
|
164
|
+
GUMBO_TAG_H5,
|
165
|
+
GUMBO_TAG_H6,
|
166
|
+
GUMBO_TAG_HGROUP,
|
167
|
+
GUMBO_TAG_HEADER,
|
168
|
+
GUMBO_TAG_FOOTER,
|
169
|
+
GUMBO_TAG_ADDRESS,
|
170
|
+
GUMBO_TAG_P,
|
171
|
+
GUMBO_TAG_HR,
|
172
|
+
GUMBO_TAG_PRE,
|
173
|
+
GUMBO_TAG_BLOCKQUOTE,
|
174
|
+
GUMBO_TAG_OL,
|
175
|
+
GUMBO_TAG_UL,
|
176
|
+
GUMBO_TAG_LI,
|
177
|
+
GUMBO_TAG_DL,
|
178
|
+
GUMBO_TAG_DT,
|
179
|
+
GUMBO_TAG_DD,
|
180
|
+
GUMBO_TAG_FIGURE,
|
181
|
+
GUMBO_TAG_FIGCAPTION,
|
182
|
+
GUMBO_TAG_MAIN,
|
183
|
+
GUMBO_TAG_DIV,
|
184
|
+
GUMBO_TAG_A,
|
185
|
+
GUMBO_TAG_EM,
|
186
|
+
GUMBO_TAG_STRONG,
|
187
|
+
GUMBO_TAG_SMALL,
|
188
|
+
GUMBO_TAG_S,
|
189
|
+
GUMBO_TAG_CITE,
|
190
|
+
GUMBO_TAG_Q,
|
191
|
+
GUMBO_TAG_DFN,
|
192
|
+
GUMBO_TAG_ABBR,
|
193
|
+
GUMBO_TAG_DATA,
|
194
|
+
GUMBO_TAG_TIME,
|
195
|
+
GUMBO_TAG_CODE,
|
196
|
+
GUMBO_TAG_VAR,
|
197
|
+
GUMBO_TAG_SAMP,
|
198
|
+
GUMBO_TAG_KBD,
|
199
|
+
GUMBO_TAG_SUB,
|
200
|
+
GUMBO_TAG_SUP,
|
201
|
+
GUMBO_TAG_I,
|
202
|
+
GUMBO_TAG_B,
|
203
|
+
GUMBO_TAG_U,
|
204
|
+
GUMBO_TAG_MARK,
|
205
|
+
GUMBO_TAG_RUBY,
|
206
|
+
GUMBO_TAG_RT,
|
207
|
+
GUMBO_TAG_RP,
|
208
|
+
GUMBO_TAG_BDI,
|
209
|
+
GUMBO_TAG_BDO,
|
210
|
+
GUMBO_TAG_SPAN,
|
211
|
+
GUMBO_TAG_BR,
|
212
|
+
GUMBO_TAG_WBR,
|
213
|
+
GUMBO_TAG_INS,
|
214
|
+
GUMBO_TAG_DEL,
|
215
|
+
GUMBO_TAG_IMAGE,
|
216
|
+
GUMBO_TAG_IMG,
|
217
|
+
GUMBO_TAG_IFRAME,
|
218
|
+
GUMBO_TAG_EMBED,
|
219
|
+
GUMBO_TAG_OBJECT,
|
220
|
+
GUMBO_TAG_PARAM,
|
221
|
+
GUMBO_TAG_VIDEO,
|
222
|
+
GUMBO_TAG_AUDIO,
|
223
|
+
GUMBO_TAG_SOURCE,
|
224
|
+
GUMBO_TAG_TRACK,
|
225
|
+
GUMBO_TAG_CANVAS,
|
226
|
+
GUMBO_TAG_MAP,
|
227
|
+
GUMBO_TAG_AREA,
|
228
|
+
GUMBO_TAG_MATH,
|
229
|
+
GUMBO_TAG_MI,
|
230
|
+
GUMBO_TAG_MO,
|
231
|
+
GUMBO_TAG_MN,
|
232
|
+
GUMBO_TAG_MS,
|
233
|
+
GUMBO_TAG_MTEXT,
|
234
|
+
GUMBO_TAG_MGLYPH,
|
235
|
+
GUMBO_TAG_MALIGNMARK,
|
236
|
+
GUMBO_TAG_ANNOTATION_XML,
|
237
|
+
GUMBO_TAG_SVG,
|
238
|
+
GUMBO_TAG_FOREIGNOBJECT,
|
239
|
+
GUMBO_TAG_DESC,
|
240
|
+
GUMBO_TAG_TABLE,
|
241
|
+
GUMBO_TAG_CAPTION,
|
242
|
+
GUMBO_TAG_COLGROUP,
|
243
|
+
GUMBO_TAG_COL,
|
244
|
+
GUMBO_TAG_TBODY,
|
245
|
+
GUMBO_TAG_THEAD,
|
246
|
+
GUMBO_TAG_TFOOT,
|
247
|
+
GUMBO_TAG_TR,
|
248
|
+
GUMBO_TAG_TD,
|
249
|
+
GUMBO_TAG_TH,
|
250
|
+
GUMBO_TAG_FORM,
|
251
|
+
GUMBO_TAG_FIELDSET,
|
252
|
+
GUMBO_TAG_LEGEND,
|
253
|
+
GUMBO_TAG_LABEL,
|
254
|
+
GUMBO_TAG_INPUT,
|
255
|
+
GUMBO_TAG_BUTTON,
|
256
|
+
GUMBO_TAG_SELECT,
|
257
|
+
GUMBO_TAG_DATALIST,
|
258
|
+
GUMBO_TAG_OPTGROUP,
|
259
|
+
GUMBO_TAG_OPTION,
|
260
|
+
GUMBO_TAG_TEXTAREA,
|
261
|
+
GUMBO_TAG_KEYGEN,
|
262
|
+
GUMBO_TAG_OUTPUT,
|
263
|
+
GUMBO_TAG_PROGRESS,
|
264
|
+
GUMBO_TAG_METER,
|
265
|
+
GUMBO_TAG_DETAILS,
|
266
|
+
GUMBO_TAG_SUMMARY,
|
267
|
+
GUMBO_TAG_MENU,
|
268
|
+
GUMBO_TAG_MENUITEM,
|
269
|
+
GUMBO_TAG_APPLET,
|
270
|
+
GUMBO_TAG_ACRONYM,
|
271
|
+
GUMBO_TAG_BGSOUND,
|
272
|
+
GUMBO_TAG_DIR,
|
273
|
+
GUMBO_TAG_FRAME,
|
274
|
+
GUMBO_TAG_FRAMESET,
|
275
|
+
GUMBO_TAG_NOFRAMES,
|
276
|
+
GUMBO_TAG_LISTING,
|
277
|
+
GUMBO_TAG_XMP,
|
278
|
+
GUMBO_TAG_NEXTID,
|
279
|
+
GUMBO_TAG_NOEMBED,
|
280
|
+
GUMBO_TAG_PLAINTEXT,
|
281
|
+
GUMBO_TAG_RB,
|
282
|
+
GUMBO_TAG_STRIKE,
|
283
|
+
GUMBO_TAG_BASEFONT,
|
284
|
+
GUMBO_TAG_BIG,
|
285
|
+
GUMBO_TAG_BLINK,
|
286
|
+
GUMBO_TAG_CENTER,
|
287
|
+
GUMBO_TAG_FONT,
|
288
|
+
GUMBO_TAG_MARQUEE,
|
289
|
+
GUMBO_TAG_MULTICOL,
|
290
|
+
GUMBO_TAG_NOBR,
|
291
|
+
GUMBO_TAG_SPACER,
|
292
|
+
GUMBO_TAG_TT,
|
293
|
+
GUMBO_TAG_RTC,
|
294
|
+
GUMBO_TAG_DIALOG,
|
295
|
+
// Used for all tags that don't have special handling in HTML.
|
161
296
|
GUMBO_TAG_UNKNOWN,
|
162
297
|
// A marker value to indicate the end of the enum, for iterating over it.
|
163
|
-
// Also used as the terminator for varargs functions that take tags.
|
164
298
|
GUMBO_TAG_LAST,
|
165
299
|
} GumboTag;
|
166
300
|
|
167
301
|
/**
|
168
|
-
* Returns the normalized (
|
169
|
-
*
|
170
|
-
* library.
|
302
|
+
* Returns the normalized (all lower case) tag name for a `GumboTag` enum. The
|
303
|
+
* return value is static data owned by the library.
|
171
304
|
*/
|
172
305
|
const char* gumbo_normalized_tagname(GumboTag tag);
|
173
306
|
|
174
307
|
/**
|
175
|
-
* Extracts the tag name from the original_text field of an element
|
176
|
-
* stripping off
|
177
|
-
* GumboStringPiece appropriately.
|
178
|
-
* shares a buffer with the original
|
179
|
-
* Behavior is undefined if a
|
180
|
-
*
|
181
|
-
*
|
182
|
-
*
|
308
|
+
* Extracts the tag name from the `original_text` field of an element
|
309
|
+
* or token by stripping off `</>` characters and attributes and
|
310
|
+
* adjusting the passed-in `GumboStringPiece` appropriately. The tag
|
311
|
+
* name is in the original case and shares a buffer with the original
|
312
|
+
* text, to simplify memory management. Behavior is undefined if a
|
313
|
+
* string piece that doesn't represent an HTML tag (`<tagname>` or
|
314
|
+
* `</tagname>`) is passed in. If the string piece is completely
|
315
|
+
* empty (`NULL` data pointer), then this function will exit
|
316
|
+
* successfully as a no-op.
|
183
317
|
*/
|
184
318
|
void gumbo_tag_from_original_text(GumboStringPiece* text);
|
185
319
|
|
186
320
|
/**
|
187
|
-
* Fixes the case of SVG elements that are not all lowercase.
|
188
|
-
*
|
189
|
-
*
|
190
|
-
*
|
191
|
-
*
|
192
|
-
*
|
193
|
-
*
|
194
|
-
*
|
195
|
-
*
|
196
|
-
*
|
321
|
+
* Fixes the case of SVG elements that are not all lowercase. This is
|
322
|
+
* not done at parse time because there's no place to store a mutated
|
323
|
+
* tag name. `tag_name` is an enum (which will be `GUMBO_TAG_UNKNOWN` for most
|
324
|
+
* SVG tags without special handling), while `original_tag_name` is a
|
325
|
+
* pointer into the original buffer. Instead, we provide this helper
|
326
|
+
* function that clients can use to rename SVG tags as appropriate.
|
327
|
+
* Returns the case-normalized SVG tagname if a replacement is found, or
|
328
|
+
* `NULL` if no normalization is called for. The return value is static
|
329
|
+
* data and owned by the library.
|
330
|
+
*
|
331
|
+
* @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
|
197
332
|
*/
|
198
333
|
const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
|
199
334
|
|
200
335
|
/**
|
201
|
-
* Converts a tag name string (which may be in upper or mixed case) to a
|
202
|
-
* enum.
|
336
|
+
* Converts a tag name string (which may be in upper or mixed case) to a
|
337
|
+
* tag enum.
|
203
338
|
*/
|
204
|
-
GumboTag
|
205
|
-
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
|
339
|
+
GumboTag gumbo_tagn_enum(const char* tagname, size_t length);
|
206
340
|
|
207
341
|
/**
|
208
342
|
* Attribute namespaces.
|
209
|
-
* HTML includes special handling for XLink, XML, and XMLNS namespaces
|
210
|
-
* attributes.
|
343
|
+
* HTML includes special handling for XLink, XML, and XMLNS namespaces
|
344
|
+
* on attributes. Everything else goes in the generic "NONE" namespace.
|
211
345
|
*/
|
212
346
|
typedef enum {
|
213
347
|
GUMBO_ATTR_NAMESPACE_NONE,
|
@@ -217,46 +351,47 @@ typedef enum {
|
|
217
351
|
} GumboAttributeNamespaceEnum;
|
218
352
|
|
219
353
|
/**
|
220
|
-
* A struct representing a single attribute on
|
221
|
-
* name-value pair, but also includes information about source locations
|
222
|
-
* original source text.
|
354
|
+
* A struct representing a single attribute on an HTML tag. This is a
|
355
|
+
* name-value pair, but also includes information about source locations
|
356
|
+
* and original source text.
|
223
357
|
*/
|
224
358
|
typedef struct {
|
225
359
|
/**
|
226
|
-
* The namespace for the attribute.
|
227
|
-
* GUMBO_ATTR_NAMESPACE_NONE
|
228
|
-
* values, per:
|
229
|
-
*
|
360
|
+
* The namespace for the attribute. This will usually be
|
361
|
+
* `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes
|
362
|
+
* take special values, per:
|
363
|
+
* https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
|
230
364
|
*/
|
231
365
|
GumboAttributeNamespaceEnum attr_namespace;
|
232
366
|
|
233
367
|
/**
|
234
|
-
* The name of the attribute.
|
235
|
-
* with case-normalization
|
368
|
+
* The name of the attribute. This is in a freshly-allocated buffer to
|
369
|
+
* deal with case-normalization and is null-terminated.
|
236
370
|
*/
|
237
371
|
const char* name;
|
238
372
|
|
239
373
|
/**
|
240
|
-
* The original text of the attribute name, as a pointer into the
|
241
|
-
* source buffer.
|
374
|
+
* The original text of the attribute name, as a pointer into the
|
375
|
+
* original source buffer.
|
242
376
|
*/
|
243
377
|
GumboStringPiece original_name;
|
244
378
|
|
245
379
|
/**
|
246
|
-
* The value of the attribute.
|
247
|
-
* with unescaping
|
248
|
-
* that surround the attribute.
|
249
|
-
*
|
380
|
+
* The value of the attribute. This is in a freshly-allocated buffer
|
381
|
+
* to deal with unescaping and is null-terminated. It does not include
|
382
|
+
* any quotes that surround the attribute. If the attribute has no
|
383
|
+
* value (for example, `selected` on a checkbox) this will be an empty
|
384
|
+
* string.
|
250
385
|
*/
|
251
386
|
const char* value;
|
252
387
|
|
253
388
|
/**
|
254
|
-
* The original text of the value of the attribute.
|
255
|
-
* original source buffer.
|
256
|
-
* attribute
|
257
|
-
* original_value.data[original_value.length - 1] to determine what
|
258
|
-
* characters were.
|
259
|
-
* string.
|
389
|
+
* The original text of the value of the attribute. This points into
|
390
|
+
* the original source buffer. It includes any quotes that surround
|
391
|
+
* the attribute and you can look at `original_value.data[0]` and
|
392
|
+
* `original_value.data[original_value.length - 1]` to determine what
|
393
|
+
* the quote characters were. If the attribute has no value this will
|
394
|
+
* be a 0-length string.
|
260
395
|
*/
|
261
396
|
GumboStringPiece original_value;
|
262
397
|
|
@@ -264,9 +399,9 @@ typedef struct {
|
|
264
399
|
GumboSourcePosition name_start;
|
265
400
|
|
266
401
|
/**
|
267
|
-
* The ending position of the attribute name.
|
402
|
+
* The ending position of the attribute name. This is not always derivable
|
268
403
|
* from the starting position of the value because of the possibility of
|
269
|
-
* whitespace around the
|
404
|
+
* whitespace around the `=` sign.
|
270
405
|
*/
|
271
406
|
GumboSourcePosition name_end;
|
272
407
|
|
@@ -278,34 +413,37 @@ typedef struct {
|
|
278
413
|
} GumboAttribute;
|
279
414
|
|
280
415
|
/**
|
281
|
-
* Given a vector of
|
282
|
-
* and return it, or NULL if no such attribute exists.
|
283
|
-
* case-insensitive match, as HTML is case-insensitive.
|
416
|
+
* Given a vector of `GumboAttribute`s, look up the one with the
|
417
|
+
* specified name and return it, or `NULL` if no such attribute exists.
|
418
|
+
* This uses a case-insensitive match, as HTML is case-insensitive.
|
284
419
|
*/
|
285
420
|
GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
|
286
421
|
|
287
422
|
/**
|
288
|
-
* Enum denoting the type of node.
|
289
|
-
* union.
|
423
|
+
* Enum denoting the type of node. This determines the type of the
|
424
|
+
* `node.v` union.
|
290
425
|
*/
|
291
426
|
typedef enum {
|
292
|
-
/** Document node.
|
427
|
+
/** Document node. `v` will be a `GumboDocument`. */
|
293
428
|
GUMBO_NODE_DOCUMENT,
|
294
|
-
/** Element node.
|
429
|
+
/** Element node. `v` will be a `GumboElement`. */
|
295
430
|
GUMBO_NODE_ELEMENT,
|
296
|
-
/** Text node.
|
431
|
+
/** Text node. `v` will be a `GumboText`. */
|
297
432
|
GUMBO_NODE_TEXT,
|
298
|
-
/** CDATA node. v will be a GumboText
|
433
|
+
/** CDATA node. `v` will be a `GumboText`. */
|
299
434
|
GUMBO_NODE_CDATA,
|
300
|
-
/** Comment node.
|
435
|
+
/** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */
|
301
436
|
GUMBO_NODE_COMMENT,
|
302
|
-
/** Text node, where all contents is whitespace.
|
437
|
+
/** Text node, where all contents is whitespace. `v` will be a `GumboText`. */
|
303
438
|
GUMBO_NODE_WHITESPACE,
|
304
|
-
/**
|
305
|
-
*
|
306
|
-
*
|
307
|
-
*
|
308
|
-
*
|
439
|
+
/**
|
440
|
+
* Template node. This is separate from `GUMBO_NODE_ELEMENT` because
|
441
|
+
* many client libraries will want to ignore the contents of template
|
442
|
+
* nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will
|
443
|
+
* do the right thing here, while clients that want to include template
|
444
|
+
* contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a
|
445
|
+
* `GumboElement`.
|
446
|
+
*/
|
309
447
|
GUMBO_NODE_TEMPLATE
|
310
448
|
} GumboNodeType;
|
311
449
|
|
@@ -315,9 +453,7 @@ typedef enum {
|
|
315
453
|
*/
|
316
454
|
typedef struct GumboInternalNode GumboNode;
|
317
455
|
|
318
|
-
/**
|
319
|
-
* http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode
|
320
|
-
*/
|
456
|
+
/** https://dom.spec.whatwg.org/#concept-document-quirks */
|
321
457
|
typedef enum {
|
322
458
|
GUMBO_DOCTYPE_NO_QUIRKS,
|
323
459
|
GUMBO_DOCTYPE_QUIRKS,
|
@@ -326,10 +462,11 @@ typedef enum {
|
|
326
462
|
|
327
463
|
/**
|
328
464
|
* Namespaces.
|
329
|
-
* Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
|
330
|
-
* anything inside an
|
331
|
-
*
|
332
|
-
*
|
465
|
+
* Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
|
466
|
+
* Rather, anything inside an `<svg>` tag is in the SVG namespace,
|
467
|
+
* anything inside the `<math>` tag is in the MathML namespace, and
|
468
|
+
* anything else is inside the HTML namespace. No other namespaces are
|
469
|
+
* supported, so this can be an `enum`.
|
333
470
|
*/
|
334
471
|
typedef enum {
|
335
472
|
GUMBO_NAMESPACE_HTML,
|
@@ -339,66 +476,70 @@ typedef enum {
|
|
339
476
|
|
340
477
|
/**
|
341
478
|
* Parse flags.
|
342
|
-
* We track the reasons for parser insertion of nodes and store them in
|
343
|
-
* bitvector in the node itself.
|
344
|
-
* are implied by the HTML structure of the document, or flag
|
345
|
-
* may not be allowed by a style guide, or track the
|
346
|
-
* tricky HTML code.
|
479
|
+
* We track the reasons for parser insertion of nodes and store them in
|
480
|
+
* a bitvector in the node itself. This lets client code optimize out
|
481
|
+
* nodes that are implied by the HTML structure of the document, or flag
|
482
|
+
* constructs that may not be allowed by a style guide, or track the
|
483
|
+
* prevalence of incorrect or tricky HTML code.
|
347
484
|
*/
|
348
485
|
typedef enum {
|
349
486
|
/**
|
350
|
-
* A normal node
|
351
|
-
* been reparented.
|
487
|
+
* A normal node -- both start and end tags appear in the source,
|
488
|
+
* nothing has been reparented.
|
352
489
|
*/
|
353
490
|
GUMBO_INSERTION_NORMAL = 0,
|
354
491
|
|
355
492
|
/**
|
356
|
-
* A node inserted by the parser to fulfill some implicit insertion
|
357
|
-
* This is usually set in addition to some other flag giving a
|
358
|
-
* insertion reason; it's a generic catch-all term
|
359
|
-
* this node did not appear in the document
|
493
|
+
* A node inserted by the parser to fulfill some implicit insertion
|
494
|
+
* rule. This is usually set in addition to some other flag giving a
|
495
|
+
* more specific insertion reason; it's a generic catch-all term
|
496
|
+
* meaning "The start tag for this node did not appear in the document
|
497
|
+
* source".
|
360
498
|
*/
|
361
499
|
GUMBO_INSERTION_BY_PARSER = 1 << 0,
|
362
500
|
|
363
501
|
/**
|
364
|
-
* A flag indicating that the end tag for this node did not appear in
|
365
|
-
* document source.
|
366
|
-
* parser-inserted nodes with an explicit end tag
|
367
|
-
* has GUMBO_INSERTED_BY_PARSER set on the
|
368
|
-
* GUMBO_INSERTED_END_TAG_IMPLICITLY is unset, as the
|
369
|
-
*
|
370
|
-
*
|
371
|
-
*
|
372
|
-
*
|
502
|
+
* A flag indicating that the end tag for this node did not appear in
|
503
|
+
* the document source. Note that in some cases, you can still have
|
504
|
+
* parser-inserted nodes with an explicit end tag. For example,
|
505
|
+
* `Text</html>` has `GUMBO_INSERTION_BY_PARSER` set on the `<html>`
|
506
|
+
* node, but `GUMBO_INSERTION_IMPLICIT_END_TAG` is unset, as the
|
507
|
+
* `</html>` tag actually exists.
|
508
|
+
*
|
509
|
+
* This flag will be set only if the end tag is completely missing.
|
510
|
+
* In some cases, the end tag may be misplaced (e.g. a `</body>` tag
|
511
|
+
* with text afterwards), which will leave this flag unset and require
|
512
|
+
* clients to inspect the parse errors for that case.
|
373
513
|
*/
|
374
514
|
GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
|
375
515
|
|
376
516
|
// Value 1 << 2 was for a flag that has since been removed.
|
377
517
|
|
378
518
|
/**
|
379
|
-
* A flag for nodes that are inserted because their presence is
|
380
|
-
* other tags,
|
519
|
+
* A flag for nodes that are inserted because their presence is
|
520
|
+
* implied by other tags, e.g. `<html>`, `<head>`, `<body>`,
|
521
|
+
* `<tbody>`, etc.
|
381
522
|
*/
|
382
523
|
GUMBO_INSERTION_IMPLIED = 1 << 3,
|
383
524
|
|
384
525
|
/**
|
385
|
-
* A flag for nodes that are converted from their end tag equivalents.
|
386
|
-
* example,
|
387
|
-
* create a
|
388
|
-
* as
|
526
|
+
* A flag for nodes that are converted from their end tag equivalents.
|
527
|
+
* For example, `</p>` when no paragraph is open implies that the
|
528
|
+
* parser should create a `<p>` tag and immediately close it, while
|
529
|
+
* `</br>` means the same thing as `<br>`.
|
389
530
|
*/
|
390
531
|
GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
|
391
532
|
|
392
|
-
|
393
|
-
GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
|
533
|
+
// Value 1 << 5 was for a flag that has since been removed.
|
394
534
|
|
395
|
-
/** A flag for
|
535
|
+
/** A flag for `<image>` tags that are rewritten as `<img>`. */
|
396
536
|
GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
|
397
537
|
|
398
538
|
/**
|
399
|
-
* A flag for nodes that are cloned as a result of the reconstruction
|
400
|
-
* active formatting elements.
|
401
|
-
* portion of the formatting run is a NORMAL node with an
|
539
|
+
* A flag for nodes that are cloned as a result of the reconstruction
|
540
|
+
* of active formatting elements. This is set only on the clone; the
|
541
|
+
* initial portion of the formatting run is a NORMAL node with an
|
542
|
+
* `IMPLICIT_END_TAG`.
|
402
543
|
*/
|
403
544
|
GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
|
404
545
|
|
@@ -415,18 +556,19 @@ typedef enum {
|
|
415
556
|
GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
|
416
557
|
} GumboParseFlags;
|
417
558
|
|
418
|
-
/**
|
419
|
-
* Information specific to document nodes.
|
420
|
-
*/
|
559
|
+
/** Information specific to document nodes. */
|
421
560
|
typedef struct {
|
422
561
|
/**
|
423
|
-
* An array of
|
424
|
-
* normally consist of the
|
425
|
-
* Pointers are owned.
|
562
|
+
* An array of `GumboNode`s, containing the children of this element.
|
563
|
+
* This will normally consist of the `<html>` element and any comment
|
564
|
+
* nodes found. Pointers are owned.
|
426
565
|
*/
|
427
566
|
GumboVector /* GumboNode* */ children;
|
428
567
|
|
429
|
-
|
568
|
+
/**
|
569
|
+
* `true` if there was an explicit doctype token, as opposed to it
|
570
|
+
* being omitted.
|
571
|
+
*/
|
430
572
|
bool has_doctype;
|
431
573
|
|
432
574
|
// Fields from the doctype token, copied verbatim.
|
@@ -435,65 +577,70 @@ typedef struct {
|
|
435
577
|
const char* system_identifier;
|
436
578
|
|
437
579
|
/**
|
438
|
-
* Whether or not the document is in QuirksMode, as determined by the
|
439
|
-
* in the GumboTokenDocType template.
|
580
|
+
* Whether or not the document is in QuirksMode, as determined by the
|
581
|
+
* values in the GumboTokenDocType template.
|
440
582
|
*/
|
441
583
|
GumboQuirksModeEnum doc_type_quirks_mode;
|
442
584
|
} GumboDocument;
|
443
585
|
|
444
586
|
/**
|
445
|
-
* The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
|
446
|
-
* This contains just a block of text and its position.
|
587
|
+
* The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
|
588
|
+
* elements. This contains just a block of text and its position.
|
447
589
|
*/
|
448
590
|
typedef struct {
|
449
591
|
/**
|
450
|
-
* The text of this node, after entities have been parsed and decoded.
|
451
|
-
* comment
|
592
|
+
* The text of this node, after entities have been parsed and decoded.
|
593
|
+
* For comment and cdata nodes, this does not include the comment
|
594
|
+
* delimiters.
|
452
595
|
*/
|
453
596
|
const char* text;
|
454
597
|
|
455
598
|
/**
|
456
|
-
* The original text of this node, as a pointer into the original
|
457
|
-
* comment/cdata nodes, this includes the comment
|
599
|
+
* The original text of this node, as a pointer into the original
|
600
|
+
* buffer. For comment/cdata nodes, this includes the comment
|
601
|
+
* delimiters.
|
458
602
|
*/
|
459
603
|
GumboStringPiece original_text;
|
460
604
|
|
461
605
|
/**
|
462
|
-
* The starting position of this node.
|
463
|
-
* original_text
|
606
|
+
* The starting position of this node. This corresponds to the
|
607
|
+
* position of `original_text`, before entities are decoded.
|
464
608
|
* */
|
465
609
|
GumboSourcePosition start_pos;
|
466
610
|
} GumboText;
|
467
611
|
|
468
612
|
/**
|
469
|
-
* The struct used to represent all HTML elements.
|
470
|
-
* about the tag, attributes, and child nodes.
|
613
|
+
* The struct used to represent all HTML elements. This contains
|
614
|
+
* information about the tag, attributes, and child nodes.
|
471
615
|
*/
|
472
616
|
typedef struct {
|
473
617
|
/**
|
474
|
-
* An array of
|
475
|
-
* are owned.
|
618
|
+
* An array of `GumboNode`s, containing the children of this element.
|
619
|
+
* Pointers are owned.
|
476
620
|
*/
|
477
621
|
GumboVector /* GumboNode* */ children;
|
478
622
|
|
479
623
|
/** The GumboTag enum for this element. */
|
480
624
|
GumboTag tag;
|
481
625
|
|
626
|
+
/** The name for this element. */
|
627
|
+
const char* name;
|
628
|
+
|
482
629
|
/** The GumboNamespaceEnum for this element. */
|
483
630
|
GumboNamespaceEnum tag_namespace;
|
484
631
|
|
485
632
|
/**
|
486
|
-
* A GumboStringPiece pointing to the original tag text for this
|
487
|
-
* pointing directly into the source buffer.
|
488
|
-
* algorithmically (for example,
|
489
|
-
* zero-length string.
|
633
|
+
* A `GumboStringPiece` pointing to the original tag text for this
|
634
|
+
* element, pointing directly into the source buffer. If the tag was
|
635
|
+
* inserted algorithmically (for example, `<head>` or `<tbody>`
|
636
|
+
* insertion), this will be a zero-length string.
|
490
637
|
*/
|
491
638
|
GumboStringPiece original_tag;
|
492
639
|
|
493
640
|
/**
|
494
|
-
* A GumboStringPiece pointing to the original end tag text for this
|
495
|
-
* If the end tag was inserted algorithmically, (for example,
|
496
|
-
* self-closing tag), this will be a zero-length string.
|
641
|
+
* A `GumboStringPiece` pointing to the original end tag text for this
|
642
|
+
* element. If the end tag was inserted algorithmically, (for example,
|
643
|
+
* closing a self-closing tag), this will be a zero-length string.
|
497
644
|
*/
|
498
645
|
GumboStringPiece original_end_tag;
|
499
646
|
|
@@ -504,30 +651,31 @@ typedef struct {
|
|
504
651
|
GumboSourcePosition end_pos;
|
505
652
|
|
506
653
|
/**
|
507
|
-
* An array of
|
508
|
-
* order that they were parsed.
|
654
|
+
* An array of `GumboAttribute`s, containing the attributes for this
|
655
|
+
* tag in the order that they were parsed. Pointers are owned.
|
509
656
|
*/
|
510
657
|
GumboVector /* GumboAttribute* */ attributes;
|
511
658
|
} GumboElement;
|
512
659
|
|
513
660
|
/**
|
514
|
-
* A supertype for GumboElement and GumboText
|
515
|
-
* generic type in lists of children and cast as necessary
|
661
|
+
* A supertype for `GumboElement` and `GumboText`, so that we can
|
662
|
+
* include one generic type in lists of children and cast as necessary
|
663
|
+
* to subtypes.
|
516
664
|
*/
|
517
665
|
struct GumboInternalNode {
|
518
666
|
/** The type of node that this is. */
|
519
667
|
GumboNodeType type;
|
520
668
|
|
521
|
-
/** Pointer back to parent node.
|
669
|
+
/** Pointer back to parent node. Not owned. */
|
522
670
|
GumboNode* parent;
|
523
671
|
|
524
672
|
/** The index within the parent's children vector of this node. */
|
525
|
-
|
673
|
+
unsigned int index_within_parent;
|
526
674
|
|
527
675
|
/**
|
528
|
-
* A bitvector of flags containing information about why this element
|
529
|
-
* inserted into the parse tree, including a variety of special
|
530
|
-
* situations.
|
676
|
+
* A bitvector of flags containing information about why this element
|
677
|
+
* was inserted into the parse tree, including a variety of special
|
678
|
+
* parse situations.
|
531
679
|
*/
|
532
680
|
GumboParseFlags parse_flags;
|
533
681
|
|
@@ -539,133 +687,257 @@ struct GumboInternalNode {
|
|
539
687
|
} v;
|
540
688
|
};
|
541
689
|
|
542
|
-
/**
|
543
|
-
* The type for an allocator function. Takes the 'userdata' member of the
|
544
|
-
* GumboParser struct as its first argument. Semantics should be the same as
|
545
|
-
* malloc, i.e. return a block of size_t bytes on success or NULL on failure.
|
546
|
-
* Allocating a block of 0 bytes behaves as per malloc.
|
547
|
-
*/
|
548
|
-
// TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
|
549
|
-
typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
|
550
|
-
|
551
|
-
/**
|
552
|
-
* The type for a deallocator function. Takes the 'userdata' member of the
|
553
|
-
* GumboParser struct as its first argument.
|
554
|
-
*/
|
555
|
-
typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
|
556
|
-
|
557
690
|
/**
|
558
691
|
* Input struct containing configuration options for the parser.
|
559
|
-
* These let you specify alternate memory managers, provide different
|
560
|
-
* handling, etc.
|
561
|
-
*
|
692
|
+
* These let you set resource limits, a fragment context, different
|
693
|
+
* error handling, etc. Use `kGumboDefaultOptions` for sensible
|
694
|
+
* defaults and only set what you need.
|
562
695
|
*/
|
563
696
|
typedef struct GumboInternalOptions {
|
564
|
-
/**
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
697
|
+
/**
|
698
|
+
* The tab-stop size, for computing positions in HTML files that
|
699
|
+
* use tabs. Default: `8`.
|
700
|
+
*/
|
701
|
+
int tab_stop;
|
569
702
|
|
570
703
|
/**
|
571
|
-
*
|
572
|
-
*
|
704
|
+
* Whether or not to stop parsing when the first error is encountered.
|
705
|
+
* Default: `false`.
|
573
706
|
*/
|
574
|
-
|
707
|
+
bool stop_on_first_error;
|
575
708
|
|
576
709
|
/**
|
577
|
-
*
|
578
|
-
*
|
710
|
+
* Maximum allowed number of attributes per element. If this limit is
|
711
|
+
* exceeded, the parser will return early with a partial document and
|
712
|
+
* the returned `GumboOutput` will have its `status` field set to
|
713
|
+
* `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
|
714
|
+
* Default: `400`.
|
579
715
|
*/
|
580
|
-
int
|
716
|
+
int max_attributes;
|
581
717
|
|
582
718
|
/**
|
583
|
-
*
|
584
|
-
*
|
719
|
+
* Maximum allowed depth for the parse tree. If this limit is exceeded,
|
720
|
+
* the parser will return early with a partial document and the returned
|
721
|
+
* `GumboOutput` will have its `status` field set to
|
722
|
+
* `GUMBO_STATUS_TREE_TOO_DEEP`.
|
723
|
+
* Default: `400`.
|
585
724
|
*/
|
586
|
-
|
725
|
+
unsigned int max_tree_depth;
|
587
726
|
|
588
727
|
/**
|
589
|
-
* The maximum number of errors before the parser stops recording
|
590
|
-
* is provided so that if the page is totally borked, we
|
591
|
-
* up the errors vector and exhaust memory with
|
592
|
-
* to
|
593
|
-
* Default:
|
728
|
+
* The maximum number of errors before the parser stops recording
|
729
|
+
* them. This is provided so that if the page is totally borked, we
|
730
|
+
* don't completely fill up the errors vector and exhaust memory with
|
731
|
+
* useless redundant errors. Set to `-1` to disable the limit.
|
732
|
+
* Default: `-1`.
|
594
733
|
*/
|
595
734
|
int max_errors;
|
596
735
|
|
597
736
|
/**
|
598
737
|
* The fragment context for parsing:
|
599
|
-
* https://html.spec.whatwg.org/multipage/
|
738
|
+
* https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
|
600
739
|
*
|
601
|
-
* If
|
602
|
-
* the regular parsing algorithm.
|
603
|
-
* intended parent of the parsed fragment.
|
604
|
-
*
|
605
|
-
*
|
606
|
-
* if parsing a fragment even when a full HTML tree isn't available.
|
740
|
+
* If `NULL` is passed here, it is assumed to be "no
|
741
|
+
* fragment", i.e. the regular parsing algorithm. Otherwise, pass the
|
742
|
+
* tag name for the intended parent of the parsed fragment. We use the
|
743
|
+
* tag name, namespace, and encoding attribute which are sufficient to
|
744
|
+
* set all of the parsing context needed for fragment parsing.
|
607
745
|
*
|
608
|
-
* Default:
|
746
|
+
* Default: `NULL`.
|
609
747
|
*/
|
610
|
-
|
748
|
+
const char* fragment_context;
|
611
749
|
|
612
750
|
/**
|
613
|
-
* The namespace for the fragment context.
|
614
|
-
* differentiate between, say, parsing a
|
615
|
-
* HTML.
|
616
|
-
*
|
751
|
+
* The namespace for the fragment context. This lets client code
|
752
|
+
* differentiate between, say, parsing a `<title>` tag in SVG vs.
|
753
|
+
* parsing it in HTML.
|
754
|
+
*
|
755
|
+
* Default: `GUMBO_NAMESPACE_HTML`.
|
617
756
|
*/
|
618
757
|
GumboNamespaceEnum fragment_namespace;
|
758
|
+
|
759
|
+
/**
|
760
|
+
* The value of the fragment context's `encoding` attribute, if any.
|
761
|
+
* Set to `NULL` for no `encoding` attribute.
|
762
|
+
*
|
763
|
+
* Default: `NULL`.
|
764
|
+
*/
|
765
|
+
const char* fragment_encoding;
|
766
|
+
|
767
|
+
/**
|
768
|
+
* Quirks mode for fragment parsing. The quirks mode for a given DOCTYPE can
|
769
|
+
* be looked up using `gumbo_compute_quirks_mode()`.
|
770
|
+
*
|
771
|
+
* Default: `GUMBO_DOCTYPE_NO_QUIRKS`.
|
772
|
+
*/
|
773
|
+
GumboQuirksModeEnum quirks_mode;
|
774
|
+
|
775
|
+
/**
|
776
|
+
* For fragment parsing. Set this to true if the context node has a form
|
777
|
+
* element as an ancestor.
|
778
|
+
*
|
779
|
+
* Default: `false`.
|
780
|
+
*/
|
781
|
+
bool fragment_context_has_form_ancestor;
|
619
782
|
} GumboOptions;
|
620
783
|
|
621
784
|
/** Default options struct; use this with gumbo_parse_with_options. */
|
622
785
|
extern const GumboOptions kGumboDefaultOptions;
|
623
786
|
|
787
|
+
/**
|
788
|
+
* Status code indicating whether parsing finished successfully or
|
789
|
+
* was stopped mid-document due to exceptional circumstances.
|
790
|
+
*/
|
791
|
+
typedef enum {
|
792
|
+
/**
|
793
|
+
* Indicates that parsing completed successfully. The resulting tree
|
794
|
+
* will be a complete document.
|
795
|
+
*/
|
796
|
+
GUMBO_STATUS_OK,
|
797
|
+
|
798
|
+
/**
|
799
|
+
* Indicates that the maximum element nesting limit
|
800
|
+
* (`GumboOptions::max_tree_depth`) was reached during parsing. The
|
801
|
+
* resulting tree will be a partial document, with no further nodes
|
802
|
+
* created after the point where the limit was reached. The partial
|
803
|
+
* document may be useful for constructing an error message but
|
804
|
+
* typically shouldn't be used for other purposes.
|
805
|
+
*/
|
806
|
+
GUMBO_STATUS_TREE_TOO_DEEP,
|
807
|
+
|
808
|
+
/**
|
809
|
+
* Indicates that the maximum number of attributes per element
|
810
|
+
* (`GumboOptions::max_attributes`) was reached during parsing. The
|
811
|
+
* resulting tree will be a partial document, with no further nodes
|
812
|
+
* created after the point where the limit was reached. The partial
|
813
|
+
* document may be useful for constructing an error message but
|
814
|
+
* typically shouldn't be used for other purposes.
|
815
|
+
*/
|
816
|
+
GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
|
817
|
+
|
818
|
+
// Currently unused
|
819
|
+
GUMBO_STATUS_OUT_OF_MEMORY,
|
820
|
+
} GumboOutputStatus;
|
821
|
+
|
822
|
+
|
624
823
|
/** The output struct containing the results of the parse. */
|
625
824
|
typedef struct GumboInternalOutput {
|
626
825
|
/**
|
627
|
-
* Pointer to the document node.
|
628
|
-
* that contains the entire document as its child.
|
826
|
+
* Pointer to the document node. This is a `GumboNode` of type
|
827
|
+
* `GUMBO_NODE_DOCUMENT` that contains the entire document as its child.
|
629
828
|
*/
|
630
829
|
GumboNode* document;
|
631
830
|
|
632
831
|
/**
|
633
|
-
* Pointer to the root node.
|
634
|
-
* document.
|
832
|
+
* Pointer to the root node. This is the `<html>` tag that forms the
|
833
|
+
* root of the document.
|
635
834
|
*/
|
636
835
|
GumboNode* root;
|
637
836
|
|
638
837
|
/**
|
639
838
|
* A list of errors that occurred during the parse.
|
640
|
-
* NOTE: In version 1.0 of this library, the API for errors hasn't been fully
|
641
|
-
* fleshed out and may change in the future. For this reason, the GumboError
|
642
|
-
* header isn't part of the public API. Contact us if you need errors
|
643
|
-
* reported so we can work out something appropriate for your use-case.
|
644
839
|
*/
|
645
840
|
GumboVector /* GumboError */ errors;
|
841
|
+
|
842
|
+
/**
|
843
|
+
* True if the parser encountered an error.
|
844
|
+
*
|
845
|
+
* This can be true and `errors` an empty `GumboVector` if the `max_errors`
|
846
|
+
* option was set to 0.
|
847
|
+
*/
|
848
|
+
bool document_error;
|
849
|
+
|
850
|
+
/**
|
851
|
+
* A status code indicating whether parsing finished successfully or was
|
852
|
+
* stopped mid-document due to exceptional circumstances.
|
853
|
+
*/
|
854
|
+
GumboOutputStatus status;
|
646
855
|
} GumboOutput;
|
647
856
|
|
648
857
|
/**
|
649
|
-
* Parses a buffer of
|
650
|
-
* live at least as long as the parse tree, as some fields
|
651
|
-
* point directly into the original buffer.
|
858
|
+
* Parses a buffer of UTF-8 text into a `GumboNode` parse tree. The
|
859
|
+
* buffer must live at least as long as the parse tree, as some fields
|
860
|
+
* (e.g. `original_text`) point directly into the original buffer.
|
652
861
|
*
|
653
862
|
* This doesn't support buffers longer than 4 gigabytes.
|
654
863
|
*/
|
655
864
|
GumboOutput* gumbo_parse(const char* buffer);
|
656
865
|
|
657
866
|
/**
|
658
|
-
* Extended version of gumbo_parse that takes an explicit options
|
659
|
-
* buffer, and length.
|
867
|
+
* Extended version of `gumbo_parse` that takes an explicit options
|
868
|
+
* structure, buffer, and length.
|
869
|
+
*/
|
870
|
+
GumboOutput* gumbo_parse_with_options (
|
871
|
+
const GumboOptions* options,
|
872
|
+
const char* buffer,
|
873
|
+
size_t buffer_length
|
874
|
+
);
|
875
|
+
|
876
|
+
/**
|
877
|
+
* Compute the quirks mode based on the name, public identifier, and system
|
878
|
+
* identifier. Any of these may be `NULL` to indicate a missing value.
|
879
|
+
*/
|
880
|
+
GumboQuirksModeEnum gumbo_compute_quirks_mode (
|
881
|
+
const char *name,
|
882
|
+
const char *pubid,
|
883
|
+
const char *sysid
|
884
|
+
);
|
885
|
+
|
886
|
+
/** Convert a `GumboOutputStatus` code into a readable description. */
|
887
|
+
const char* gumbo_status_to_string(GumboOutputStatus status);
|
888
|
+
|
889
|
+
/** Release the memory used for the parse tree and parse errors. */
|
890
|
+
void gumbo_destroy_output(GumboOutput* output);
|
891
|
+
|
892
|
+
/** Opaque GumboError type */
|
893
|
+
typedef struct GumboInternalError GumboError;
|
894
|
+
|
895
|
+
/**
|
896
|
+
* Returns the position of the error.
|
660
897
|
*/
|
661
|
-
|
662
|
-
const GumboOptions* options, const char* buffer, size_t buffer_length);
|
898
|
+
GumboSourcePosition gumbo_error_position(const GumboError* error);
|
663
899
|
|
664
|
-
/**
|
665
|
-
|
900
|
+
/**
|
901
|
+
* Returns a constant string representation of the error's code. This is owned
|
902
|
+
* by the library and should not be freed by the caller.
|
903
|
+
*/
|
904
|
+
const char* gumbo_error_code(const GumboError* error);
|
905
|
+
|
906
|
+
/**
|
907
|
+
* Prints an error to a string. This stores a freshly-allocated buffer
|
908
|
+
* containing the error message text in output. The caller is responsible for
|
909
|
+
* freeing the buffer. The size of the error message is returned. The error
|
910
|
+
* message itself may not be NUL-terminated and may contain NUL bytes, so the
|
911
|
+
* returned size must be used.
|
912
|
+
*/
|
913
|
+
size_t gumbo_error_to_string(const GumboError* error, char **output);
|
914
|
+
|
915
|
+
/**
|
916
|
+
* Prints a caret diagnostic to a string. This stores a freshly-allocated
|
917
|
+
* buffer containing the error message text in output. The caller is responsible for
|
918
|
+
* freeing the buffer. The size of the error message is returned. The error
|
919
|
+
* message itself may not be NUL-terminated and may contain NUL bytes, so the
|
920
|
+
* returned size must be used.
|
921
|
+
*/
|
922
|
+
size_t gumbo_caret_diagnostic_to_string (
|
923
|
+
const GumboError* error,
|
924
|
+
const char* source_text,
|
925
|
+
size_t source_length,
|
926
|
+
char** output
|
927
|
+
);
|
928
|
+
|
929
|
+
/**
|
930
|
+
* Like `gumbo_caret_diagnostic_to_string`, but prints the text to stdout
|
931
|
+
* instead of writing to a string.
|
932
|
+
*/
|
933
|
+
void gumbo_print_caret_diagnostic (
|
934
|
+
const GumboError* error,
|
935
|
+
const char* source_text,
|
936
|
+
size_t source_length
|
937
|
+
);
|
666
938
|
|
667
939
|
#ifdef __cplusplus
|
668
940
|
}
|
669
941
|
#endif
|
670
942
|
|
671
|
-
#endif
|
943
|
+
#endif // GUMBO_H
|