nokogiri 1.11.7 → 1.12.1
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/LICENSE-DEPENDENCIES.md +243 -22
- data/LICENSE.md +1 -1
- data/README.md +6 -5
- data/ext/nokogiri/depend +35 -34
- data/ext/nokogiri/extconf.rb +181 -103
- data/ext/nokogiri/gumbo.c +584 -0
- data/ext/nokogiri/{html_document.c → html4_document.c} +8 -8
- data/ext/nokogiri/{html_element_description.c → html4_element_description.c} +20 -18
- data/ext/nokogiri/{html_entity_lookup.c → html4_entity_lookup.c} +7 -7
- data/ext/nokogiri/{html_sax_parser_context.c → html4_sax_parser_context.c} +5 -5
- data/ext/nokogiri/{html_sax_push_parser.c → html4_sax_push_parser.c} +4 -4
- data/ext/nokogiri/libxml2_backwards_compat.c +30 -30
- data/ext/nokogiri/nokogiri.c +51 -38
- data/ext/nokogiri/nokogiri.h +16 -9
- data/ext/nokogiri/xml_document.c +13 -13
- data/ext/nokogiri/xml_element_content.c +2 -0
- data/ext/nokogiri/xml_encoding_handler.c +11 -6
- data/ext/nokogiri/xml_namespace.c +2 -0
- data/ext/nokogiri/xml_node.c +102 -102
- data/ext/nokogiri/xml_node_set.c +20 -20
- data/ext/nokogiri/xml_reader.c +2 -0
- data/ext/nokogiri/xml_sax_parser.c +6 -6
- data/ext/nokogiri/xml_sax_parser_context.c +2 -0
- data/ext/nokogiri/xml_schema.c +2 -0
- data/ext/nokogiri/xml_xpath_context.c +67 -65
- data/ext/nokogiri/xslt_stylesheet.c +2 -1
- data/gumbo-parser/CHANGES.md +63 -0
- data/gumbo-parser/Makefile +101 -0
- data/gumbo-parser/THANKS +27 -0
- data/gumbo-parser/src/Makefile +34 -0
- data/gumbo-parser/src/README.md +41 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +42 -0
- data/gumbo-parser/src/attribute.h +17 -0
- data/gumbo-parser/src/char_ref.c +22225 -0
- data/gumbo-parser/src/char_ref.h +29 -0
- data/gumbo-parser/src/char_ref.rl +2154 -0
- data/gumbo-parser/src/error.c +626 -0
- data/gumbo-parser/src/error.h +148 -0
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/foreign_attrs.gperf +27 -0
- data/gumbo-parser/src/gumbo.h +943 -0
- data/gumbo-parser/src/insertion_mode.h +33 -0
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +4886 -0
- data/gumbo-parser/src/parser.h +41 -0
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +103 -0
- data/gumbo-parser/src/string_buffer.h +68 -0
- data/gumbo-parser/src/string_piece.c +48 -0
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_attrs.gperf +77 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/svg_tags.gperf +55 -0
- data/gumbo-parser/src/tag.c +222 -0
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.gperf +169 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +17 -0
- data/gumbo-parser/src/tokenizer.c +3463 -0
- data/gumbo-parser/src/tokenizer.h +112 -0
- data/gumbo-parser/src/tokenizer_states.h +339 -0
- data/gumbo-parser/src/utf8.c +245 -0
- data/gumbo-parser/src/utf8.h +164 -0
- data/gumbo-parser/src/util.c +68 -0
- data/gumbo-parser/src/util.h +30 -0
- data/gumbo-parser/src/vector.c +111 -0
- data/gumbo-parser/src/vector.h +45 -0
- data/lib/nokogiri.rb +31 -29
- data/lib/nokogiri/css.rb +14 -14
- data/lib/nokogiri/css/parser.rb +1 -1
- data/lib/nokogiri/css/parser.y +1 -1
- data/lib/nokogiri/css/syntax_error.rb +1 -1
- data/lib/nokogiri/extension.rb +2 -2
- data/lib/nokogiri/gumbo.rb +14 -0
- data/lib/nokogiri/html.rb +31 -27
- data/lib/nokogiri/html4.rb +40 -0
- data/lib/nokogiri/{html → html4}/builder.rb +2 -2
- data/lib/nokogiri/{html → html4}/document.rb +4 -4
- data/lib/nokogiri/{html → html4}/document_fragment.rb +3 -3
- data/lib/nokogiri/{html → html4}/element_description.rb +1 -1
- data/lib/nokogiri/{html → html4}/element_description_defaults.rb +1 -1
- data/lib/nokogiri/{html → html4}/entity_lookup.rb +1 -1
- data/lib/nokogiri/{html → html4}/sax/parser.rb +11 -14
- data/lib/nokogiri/html4/sax/parser_context.rb +19 -0
- data/lib/nokogiri/{html → html4}/sax/push_parser.rb +5 -5
- data/lib/nokogiri/html5.rb +473 -0
- data/lib/nokogiri/html5/document.rb +74 -0
- data/lib/nokogiri/html5/document_fragment.rb +80 -0
- data/lib/nokogiri/html5/node.rb +93 -0
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/version/info.rb +11 -2
- data/lib/nokogiri/xml.rb +35 -36
- data/lib/nokogiri/xml/node.rb +6 -5
- data/lib/nokogiri/xml/parse_options.rb +2 -0
- data/lib/nokogiri/xml/pp.rb +2 -2
- data/lib/nokogiri/xml/sax.rb +4 -4
- data/lib/nokogiri/xml/sax/document.rb +24 -30
- data/lib/nokogiri/xml/xpath.rb +2 -2
- data/lib/nokogiri/xslt.rb +16 -16
- data/lib/nokogiri/xslt/stylesheet.rb +1 -1
- metadata +100 -58
- data/lib/nokogiri/html/sax/parser_context.rb +0 -17
@@ -0,0 +1,148 @@
|
|
1
|
+
#ifndef GUMBO_ERROR_H_
|
2
|
+
#define GUMBO_ERROR_H_
|
3
|
+
|
4
|
+
#include <stdint.h>
|
5
|
+
|
6
|
+
#include "gumbo.h"
|
7
|
+
#include "insertion_mode.h"
|
8
|
+
#include "string_buffer.h"
|
9
|
+
#include "token_type.h"
|
10
|
+
#include "tokenizer_states.h"
|
11
|
+
|
12
|
+
#ifdef __cplusplus
|
13
|
+
extern "C" {
|
14
|
+
#endif
|
15
|
+
|
16
|
+
struct GumboInternalParser;
|
17
|
+
|
18
|
+
// Identifies the kind of error stored in a GumboError.
//
// Enumerator values are assigned implicitly, in declaration order starting at
// zero. Inserting or reordering entries therefore renumbers every constant
// that follows, so new entries should be added with that in mind.
typedef enum {
  // Defined errors.
  // https://html.spec.whatwg.org/multipage/parsing.html#parse-errors
  GUMBO_ERR_ABRUPT_CLOSING_OF_EMPTY_COMMENT,
  GUMBO_ERR_ABRUPT_DOCTYPE_PUBLIC_IDENTIFIER,
  GUMBO_ERR_ABRUPT_DOCTYPE_SYSTEM_IDENTIFIER,
  GUMBO_ERR_ABSENCE_OF_DIGITS_IN_NUMERIC_CHARACTER_REFERENCE,
  GUMBO_ERR_CDATA_IN_HTML_CONTENT,
  GUMBO_ERR_CHARACTER_REFERENCE_OUTSIDE_UNICODE_RANGE,
  GUMBO_ERR_CONTROL_CHARACTER_IN_INPUT_STREAM,
  GUMBO_ERR_CONTROL_CHARACTER_REFERENCE,
  GUMBO_ERR_END_TAG_WITH_ATTRIBUTES,
  GUMBO_ERR_DUPLICATE_ATTRIBUTE,
  GUMBO_ERR_END_TAG_WITH_TRAILING_SOLIDUS,
  GUMBO_ERR_EOF_BEFORE_TAG_NAME,
  GUMBO_ERR_EOF_IN_CDATA,
  GUMBO_ERR_EOF_IN_COMMENT,
  GUMBO_ERR_EOF_IN_DOCTYPE,
  GUMBO_ERR_EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT,
  GUMBO_ERR_EOF_IN_TAG,
  GUMBO_ERR_INCORRECTLY_CLOSED_COMMENT,
  GUMBO_ERR_INCORRECTLY_OPENED_COMMENT,
  GUMBO_ERR_INVALID_CHARACTER_SEQUENCE_AFTER_DOCTYPE_NAME,
  GUMBO_ERR_INVALID_FIRST_CHARACTER_OF_TAG_NAME,
  GUMBO_ERR_MISSING_ATTRIBUTE_VALUE,
  GUMBO_ERR_MISSING_DOCTYPE_NAME,
  GUMBO_ERR_MISSING_DOCTYPE_PUBLIC_IDENTIFIER,
  GUMBO_ERR_MISSING_DOCTYPE_SYSTEM_IDENTIFIER,
  GUMBO_ERR_MISSING_END_TAG_NAME,
  GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_PUBLIC_IDENTIFIER,
  GUMBO_ERR_MISSING_QUOTE_BEFORE_DOCTYPE_SYSTEM_IDENTIFIER,
  GUMBO_ERR_MISSING_SEMICOLON_AFTER_CHARACTER_REFERENCE,
  GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_PUBLIC_KEYWORD,
  GUMBO_ERR_MISSING_WHITESPACE_AFTER_DOCTYPE_SYSTEM_KEYWORD,
  GUMBO_ERR_MISSING_WHITESPACE_BEFORE_DOCTYPE_NAME,
  GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
  GUMBO_ERR_MISSING_WHITESPACE_BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS,
  GUMBO_ERR_NESTED_COMMENT,
  GUMBO_ERR_NONCHARACTER_CHARACTER_REFERENCE,
  GUMBO_ERR_NONCHARACTER_IN_INPUT_STREAM,
  GUMBO_ERR_NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS,
  GUMBO_ERR_NULL_CHARACTER_REFERENCE,
  GUMBO_ERR_SURROGATE_CHARACTER_REFERENCE,
  GUMBO_ERR_SURROGATE_IN_INPUT_STREAM,
  GUMBO_ERR_UNEXPECTED_CHARACTER_AFTER_DOCTYPE_SYSTEM_IDENTIFIER,
  GUMBO_ERR_UNEXPECTED_CHARACTER_IN_ATTRIBUTE_NAME,
  GUMBO_ERR_UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE_VALUE,
  GUMBO_ERR_UNEXPECTED_EQUALS_SIGN_BEFORE_ATTRIBUTE_NAME,
  GUMBO_ERR_UNEXPECTED_NULL_CHARACTER,
  GUMBO_ERR_UNEXPECTED_QUESTION_MARK_INSTEAD_OF_TAG_NAME,
  GUMBO_ERR_UNEXPECTED_SOLIDUS_IN_TAG,
  GUMBO_ERR_UNKNOWN_NAMED_CHARACTER_REFERENCE,

  // Encoding errors.
  GUMBO_ERR_UTF8_INVALID,
  GUMBO_ERR_UTF8_TRUNCATED,

  // Generic parser error.
  GUMBO_ERR_PARSER,
} GumboErrorType;
|
78
|
+
|
79
|
+
// Additional data for tokenizer errors.
|
80
|
+
// This records the current state and codepoint encountered - this is usually
|
81
|
+
// enough to reconstruct what went wrong and provide a friendly error message.
|
82
|
+
typedef struct GumboInternalTokenizerError {
|
83
|
+
// The bad codepoint encountered.
|
84
|
+
int codepoint;
|
85
|
+
|
86
|
+
// The state that the tokenizer was in at the time.
|
87
|
+
GumboTokenizerEnum state;
|
88
|
+
} GumboTokenizerError;
|
89
|
+
|
90
|
+
// Additional data for parse errors.
|
91
|
+
typedef struct GumboInternalParserError {
|
92
|
+
// The type of input token that resulted in this error.
|
93
|
+
GumboTokenType input_type;
|
94
|
+
|
95
|
+
// The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
|
96
|
+
GumboTag input_tag;
|
97
|
+
|
98
|
+
// The insertion mode that the parser was in at the time.
|
99
|
+
GumboInsertionMode parser_state;
|
100
|
+
|
101
|
+
// The tag stack at the point of the error. Note that this is an GumboVector
|
102
|
+
// of GumboTag's *stored by value* - cast the void* to an GumboTag directly to
|
103
|
+
// get at the tag.
|
104
|
+
GumboVector /* GumboTag */ tag_stack;
|
105
|
+
} GumboParserError;
|
106
|
+
|
107
|
+
// The overall error struct representing an error in decoding/tokenizing/parsing
|
108
|
+
// the HTML. This contains an enumerated type flag, a source position, and then
|
109
|
+
// a union of fields containing data specific to the error.
|
110
|
+
struct GumboInternalError {
|
111
|
+
// The type of error.
|
112
|
+
GumboErrorType type;
|
113
|
+
|
114
|
+
// The position within the source file where the error occurred.
|
115
|
+
GumboSourcePosition position;
|
116
|
+
|
117
|
+
// The piece of text that caused the error.
|
118
|
+
GumboStringPiece original_text;
|
119
|
+
|
120
|
+
// Type-specific error information.
|
121
|
+
union {
|
122
|
+
// Tokenizer errors.
|
123
|
+
GumboTokenizerError tokenizer;
|
124
|
+
|
125
|
+
// Parser errors.
|
126
|
+
GumboParserError parser;
|
127
|
+
} v;
|
128
|
+
};
|
129
|
+
|
130
|
+
// Adds a new error to the parser's error list, and returns a pointer to it so
|
131
|
+
// that clients can fill out the rest of its fields. May return NULL if we're
|
132
|
+
// already over the max_errors field specified in GumboOptions.
|
133
|
+
GumboError* gumbo_add_error(struct GumboInternalParser* parser);
|
134
|
+
|
135
|
+
// Initializes the errors vector in the parser.
|
136
|
+
void gumbo_init_errors(struct GumboInternalParser* errors);
|
137
|
+
|
138
|
+
// Frees all the errors in the 'errors_' field of the parser.
|
139
|
+
void gumbo_destroy_errors(struct GumboInternalParser* errors);
|
140
|
+
|
141
|
+
// Frees the memory used for a single GumboError.
|
142
|
+
void gumbo_error_destroy(GumboError* error);
|
143
|
+
|
144
|
+
#ifdef __cplusplus
|
145
|
+
}
|
146
|
+
#endif
|
147
|
+
|
148
|
+
#endif // GUMBO_ERROR_H_
|
@@ -0,0 +1,104 @@
|
|
1
|
+
/* ANSI-C code produced by gperf version 3.1 */
|
2
|
+
/* Command-line: gperf -m100 -n lib/foreign_attrs.gperf */
|
3
|
+
/* Computed positions: -k'2,8' */
|
4
|
+
/* Filtered by: mk/gperf-filter.sed */
|
5
|
+
|
6
|
+
#include "replacement.h"
|
7
|
+
#include "macros.h"
|
8
|
+
#include <string.h>
|
9
|
+
|
10
|
+
#define TOTAL_KEYWORDS 11
|
11
|
+
#define MIN_WORD_LENGTH 5
|
12
|
+
#define MAX_WORD_LENGTH 13
|
13
|
+
#define MIN_HASH_VALUE 0
|
14
|
+
#define MAX_HASH_VALUE 10
|
15
|
+
/* maximum key range = 11, duplicates = 0 */
|
16
|
+
|
17
|
+
/*
 * Perfect hash over the foreign-attribute keyword set. Per the gperf header
 * of this file the key positions are 2 and 8, i.e. the bytes str[1] and
 * str[7].
 *
 * str: candidate attribute name; it does not need to be NUL-terminated.
 * len: number of readable bytes at str.
 *
 * Returns the sum of the association values of the selected bytes. A byte is
 * only read when len actually covers it, so inputs shorter than 2 (or 8)
 * bytes contribute nothing for that position instead of reading past the
 * buffer. For len >= 2 the result is the same as the generated switch.
 *
 * NOTE: this file is gperf output; regenerating it from the .gperf spec will
 * bring back the generated switch statement.
 */
static inline unsigned int
hash (const char *str, size_t len)
{
  static const unsigned char asso_values[] =
    {
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11,  2,
      11, 10, 11,  9,  7,  6, 11, 11,  1,  0,
      11,  5, 11, 11,  4, 11, 11, 11, 11, 11,
      11,  3, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11
    };
  unsigned int hval = 0;

  /* Second key position (str[7]) only exists for names of 8+ bytes. */
  if (len >= 8)
    {
      hval += asso_values[(unsigned char)str[7]];
    }
  /* First key position (str[1]) needs at least 2 bytes. */
  if (len >= 2)
    {
      hval += asso_values[(unsigned char)str[1]];
    }
  return hval;
}
|
67
|
+
|
68
|
+
const ForeignAttrReplacement *
|
69
|
+
gumbo_get_foreign_attr_replacement (register const char *str, register size_t len)
|
70
|
+
{
|
71
|
+
static const unsigned char lengthtable[] =
|
72
|
+
{
|
73
|
+
5, 11, 9, 13, 10, 10, 10, 11, 10, 8, 8
|
74
|
+
};
|
75
|
+
static const ForeignAttrReplacement wordlist[] =
|
76
|
+
{
|
77
|
+
{"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS},
|
78
|
+
{"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS},
|
79
|
+
{"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML},
|
80
|
+
{"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK},
|
81
|
+
{"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK},
|
82
|
+
{"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK},
|
83
|
+
{"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK},
|
84
|
+
{"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK},
|
85
|
+
{"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK},
|
86
|
+
{"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML},
|
87
|
+
{"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML}
|
88
|
+
};
|
89
|
+
|
90
|
+
if (len <= MAX_WORD_LENGTH && len >= MIN_WORD_LENGTH)
|
91
|
+
{
|
92
|
+
register unsigned int key = hash (str, len);
|
93
|
+
|
94
|
+
if (key <= MAX_HASH_VALUE)
|
95
|
+
if (len == lengthtable[key])
|
96
|
+
{
|
97
|
+
register const char *s = wordlist[key].from;
|
98
|
+
|
99
|
+
if (s && *str == *s && !memcmp (str + 1, s + 1, len - 1))
|
100
|
+
return &wordlist[key];
|
101
|
+
}
|
102
|
+
}
|
103
|
+
return 0;
|
104
|
+
}
|
@@ -0,0 +1,27 @@
|
|
1
|
+
%{
|
2
|
+
#include "replacement.h"
|
3
|
+
#include "macros.h"
|
4
|
+
%}
|
5
|
+
|
6
|
+
%struct-type
|
7
|
+
%omit-struct-type
|
8
|
+
%compare-lengths
|
9
|
+
%readonly-tables
|
10
|
+
%null-strings
|
11
|
+
%includes
|
12
|
+
%define lookup-function-name gumbo_get_foreign_attr_replacement
|
13
|
+
%define slot-name from
|
14
|
+
ForeignAttrReplacement;
|
15
|
+
|
16
|
+
%%
|
17
|
+
"xlink:actuate", "actuate", GUMBO_ATTR_NAMESPACE_XLINK
|
18
|
+
"xlink:href", "href", GUMBO_ATTR_NAMESPACE_XLINK
|
19
|
+
"xlink:role", "role", GUMBO_ATTR_NAMESPACE_XLINK
|
20
|
+
"xlink:show", "show", GUMBO_ATTR_NAMESPACE_XLINK
|
21
|
+
"xlink:title", "title", GUMBO_ATTR_NAMESPACE_XLINK
|
22
|
+
"xlink:type", "type", GUMBO_ATTR_NAMESPACE_XLINK
|
23
|
+
"xml:base", "base", GUMBO_ATTR_NAMESPACE_XML
|
24
|
+
"xml:lang", "lang", GUMBO_ATTR_NAMESPACE_XML
|
25
|
+
"xml:space", "space", GUMBO_ATTR_NAMESPACE_XML
|
26
|
+
"xmlns", "xmlns", GUMBO_ATTR_NAMESPACE_XMLNS
|
27
|
+
"xmlns:xlink", "xlink", GUMBO_ATTR_NAMESPACE_XMLNS
|
@@ -0,0 +1,943 @@
|
|
1
|
+
// Copyright 2010 Google Inc.
|
2
|
+
// Copyright 2018 Craig Barnes.
|
3
|
+
// Licensed under the Apache License, version 2.0.
|
4
|
+
|
5
|
+
// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions,
|
6
|
+
// GUMBO_ as a prefix for enum constants and kGumbo as a prefix for
|
7
|
+
// static constants
|
8
|
+
|
9
|
+
/**
|
10
|
+
* @file
|
11
|
+
* @mainpage Gumbo HTML Parser
|
12
|
+
*
|
13
|
+
* This provides a conformant, no-dependencies implementation of the
|
14
|
+
* [HTML5] parsing algorithm. It supports only UTF-8 -- if you need
|
15
|
+
* to parse a different encoding, run a preprocessing step to convert
|
16
|
+
* to UTF-8. It returns a parse tree made of the structs in this file.
|
17
|
+
*
|
18
|
+
* Example:
|
19
|
+
* @code
|
20
|
+
* GumboOutput* output = gumbo_parse(input);
|
21
|
+
* do_something_with_doctype(output->document);
|
22
|
+
* do_something_with_html_tree(output->root);
|
23
|
+
* gumbo_destroy_output(output);
|
24
|
+
* @endcode
|
25
|
+
*
|
26
|
+
* [HTML5]: https://html.spec.whatwg.org/multipage/
|
27
|
+
*/
|
28
|
+
|
29
|
+
#ifndef GUMBO_H
|
30
|
+
#define GUMBO_H
|
31
|
+
|
32
|
+
#include <stdbool.h>
|
33
|
+
#include <stddef.h>
|
34
|
+
|
35
|
+
#ifdef __cplusplus
|
36
|
+
extern "C" {
|
37
|
+
#endif
|
38
|
+
|
39
|
+
/**
 * A struct representing a character position within the original text
 * buffer. Line and column numbers are 1-based and offsets are 0-based,
 * which matches how most editors and command-line tools work.
 */
typedef struct {
  /** Line number, counted from 1. */
  size_t line;

  /** Column number, counted from 1. */
  size_t column;

  /** Offset within the buffer, counted from 0. */
  size_t offset;
} GumboSourcePosition;
|
49
|
+
|
50
|
+
/**
 * A struct representing a string or part of a string. Strings within
 * the parser are represented by a `char*` and a length; the `char*`
 * points into an existing data buffer owned by some other code (often
 * the original input). `GumboStringPiece`s are assumed (by convention)
 * to be immutable, because they may share data. Clients should assume
 * that the data is not NUL-terminated and should always use explicit
 * lengths when manipulating it.
 */
typedef struct {
  /** A pointer to the beginning of the string. `NULL` if `length == 0`. */
  const char* data;

  /** The length of the string fragment, in bytes (may be zero). */
  size_t length;
} GumboStringPiece;
|
66
|
+
|
67
|
+
#define GUMBO_EMPTY_STRING_INIT { .data = NULL, .length = 0 }
|
68
|
+
/** A constant to represent a 0-length null string. */
|
69
|
+
#define kGumboEmptyString (const GumboStringPiece)GUMBO_EMPTY_STRING_INIT
|
70
|
+
|
71
|
+
/**
|
72
|
+
* Compares two `GumboStringPiece`s, and returns `true` if they're
|
73
|
+
* equal or `false` otherwise.
|
74
|
+
*/
|
75
|
+
bool gumbo_string_equals (
|
76
|
+
const GumboStringPiece* str1,
|
77
|
+
const GumboStringPiece* str2
|
78
|
+
);
|
79
|
+
|
80
|
+
/**
|
81
|
+
* Compares two `GumboStringPiece`s, ignoring case, and returns `true`
|
82
|
+
* if they're equal or `false` otherwise.
|
83
|
+
*/
|
84
|
+
bool gumbo_string_equals_ignore_case (
|
85
|
+
const GumboStringPiece* str1,
|
86
|
+
const GumboStringPiece* str2
|
87
|
+
);
|
88
|
+
|
89
|
+
/**
|
90
|
+
* Check if the first `GumboStringPiece` is a prefix of the second, ignoring
|
91
|
+
* case.
|
92
|
+
*/
|
93
|
+
bool gumbo_string_prefix_ignore_case (
|
94
|
+
const GumboStringPiece* prefix,
|
95
|
+
const GumboStringPiece* str
|
96
|
+
);
|
97
|
+
|
98
|
+
/**
 * A simple vector implementation. This stores a pointer to a data array
 * and a length. All elements are stored as `void*`; client code must
 * cast to the appropriate type. Overflows upon addition result in
 * reallocation of the data array, with the size doubling to maintain
 * `O(1)` amortized cost. There is no removal function, as this isn't
 * needed for any of the operations within this library. Iteration can
 * be done through inspecting the structure directly in a `for` loop
 * over the indices `0 .. length - 1`.
 */
typedef struct {
  /**
   * Data elements. This points to a dynamically-allocated array of
   * `capacity` elements, each a `void*` to the element itself.
   */
  void** data;

  /** Number of elements currently in the vector. */
  unsigned int length;

  /** Current array capacity. */
  unsigned int capacity;
} GumboVector;
|
120
|
+
|
121
|
+
# define GUMBO_EMPTY_VECTOR_INIT { .data = NULL, .length = 0, .capacity = 0 }
|
122
|
+
/** An empty (0-length, 0-capacity) `GumboVector`. */
|
123
|
+
#define kGumboEmptyVector (const GumboVector)GUMBO_EMPTY_VECTOR_INIT
|
124
|
+
|
125
|
+
/**
|
126
|
+
* Returns the first index at which an element appears in this vector
|
127
|
+
* (testing by pointer equality), or `-1` if it never does.
|
128
|
+
*/
|
129
|
+
int gumbo_vector_index_of(GumboVector* vector, const void* element);
|
130
|
+
|
131
|
+
/**
 * An `enum` for all the tags defined in the HTML5 standard. These
 * correspond to the tag names themselves. Enum constants exist only
 * for tags that appear in the spec itself (or for tags with special
 * handling in the SVG and MathML namespaces). Any other tags appear
 * as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained
 * through `original_tag`.
 *
 * This is mostly for API convenience, so that clients of this library
 * don't need to perform a `strcasecmp` to find the normalized tag
 * name. It also has efficiency benefits, by letting the parser work
 * with enums instead of strings.
 *
 * Values are assigned implicitly in declaration order starting at zero,
 * so inserting or reordering entries renumbers the constants that follow.
 */
typedef enum {
  GUMBO_TAG_HTML,
  GUMBO_TAG_HEAD,
  GUMBO_TAG_TITLE,
  GUMBO_TAG_BASE,
  GUMBO_TAG_LINK,
  GUMBO_TAG_META,
  GUMBO_TAG_STYLE,
  GUMBO_TAG_SCRIPT,
  GUMBO_TAG_NOSCRIPT,
  GUMBO_TAG_TEMPLATE,
  GUMBO_TAG_BODY,
  GUMBO_TAG_ARTICLE,
  GUMBO_TAG_SECTION,
  GUMBO_TAG_NAV,
  GUMBO_TAG_ASIDE,
  GUMBO_TAG_H1,
  GUMBO_TAG_H2,
  GUMBO_TAG_H3,
  GUMBO_TAG_H4,
  GUMBO_TAG_H5,
  GUMBO_TAG_H6,
  GUMBO_TAG_HGROUP,
  GUMBO_TAG_HEADER,
  GUMBO_TAG_FOOTER,
  GUMBO_TAG_ADDRESS,
  GUMBO_TAG_P,
  GUMBO_TAG_HR,
  GUMBO_TAG_PRE,
  GUMBO_TAG_BLOCKQUOTE,
  GUMBO_TAG_OL,
  GUMBO_TAG_UL,
  GUMBO_TAG_LI,
  GUMBO_TAG_DL,
  GUMBO_TAG_DT,
  GUMBO_TAG_DD,
  GUMBO_TAG_FIGURE,
  GUMBO_TAG_FIGCAPTION,
  GUMBO_TAG_MAIN,
  GUMBO_TAG_DIV,
  GUMBO_TAG_A,
  GUMBO_TAG_EM,
  GUMBO_TAG_STRONG,
  GUMBO_TAG_SMALL,
  GUMBO_TAG_S,
  GUMBO_TAG_CITE,
  GUMBO_TAG_Q,
  GUMBO_TAG_DFN,
  GUMBO_TAG_ABBR,
  GUMBO_TAG_DATA,
  GUMBO_TAG_TIME,
  GUMBO_TAG_CODE,
  GUMBO_TAG_VAR,
  GUMBO_TAG_SAMP,
  GUMBO_TAG_KBD,
  GUMBO_TAG_SUB,
  GUMBO_TAG_SUP,
  GUMBO_TAG_I,
  GUMBO_TAG_B,
  GUMBO_TAG_U,
  GUMBO_TAG_MARK,
  GUMBO_TAG_RUBY,
  GUMBO_TAG_RT,
  GUMBO_TAG_RP,
  GUMBO_TAG_BDI,
  GUMBO_TAG_BDO,
  GUMBO_TAG_SPAN,
  GUMBO_TAG_BR,
  GUMBO_TAG_WBR,
  GUMBO_TAG_INS,
  GUMBO_TAG_DEL,
  GUMBO_TAG_IMAGE,
  GUMBO_TAG_IMG,
  GUMBO_TAG_IFRAME,
  GUMBO_TAG_EMBED,
  GUMBO_TAG_OBJECT,
  GUMBO_TAG_PARAM,
  GUMBO_TAG_VIDEO,
  GUMBO_TAG_AUDIO,
  GUMBO_TAG_SOURCE,
  GUMBO_TAG_TRACK,
  GUMBO_TAG_CANVAS,
  GUMBO_TAG_MAP,
  GUMBO_TAG_AREA,
  GUMBO_TAG_MATH,
  GUMBO_TAG_MI,
  GUMBO_TAG_MO,
  GUMBO_TAG_MN,
  GUMBO_TAG_MS,
  GUMBO_TAG_MTEXT,
  GUMBO_TAG_MGLYPH,
  GUMBO_TAG_MALIGNMARK,
  GUMBO_TAG_ANNOTATION_XML,
  GUMBO_TAG_SVG,
  GUMBO_TAG_FOREIGNOBJECT,
  GUMBO_TAG_DESC,
  GUMBO_TAG_TABLE,
  GUMBO_TAG_CAPTION,
  GUMBO_TAG_COLGROUP,
  GUMBO_TAG_COL,
  GUMBO_TAG_TBODY,
  GUMBO_TAG_THEAD,
  GUMBO_TAG_TFOOT,
  GUMBO_TAG_TR,
  GUMBO_TAG_TD,
  GUMBO_TAG_TH,
  GUMBO_TAG_FORM,
  GUMBO_TAG_FIELDSET,
  GUMBO_TAG_LEGEND,
  GUMBO_TAG_LABEL,
  GUMBO_TAG_INPUT,
  GUMBO_TAG_BUTTON,
  GUMBO_TAG_SELECT,
  GUMBO_TAG_DATALIST,
  GUMBO_TAG_OPTGROUP,
  GUMBO_TAG_OPTION,
  GUMBO_TAG_TEXTAREA,
  GUMBO_TAG_KEYGEN,
  GUMBO_TAG_OUTPUT,
  GUMBO_TAG_PROGRESS,
  GUMBO_TAG_METER,
  GUMBO_TAG_DETAILS,
  GUMBO_TAG_SUMMARY,
  GUMBO_TAG_MENU,
  GUMBO_TAG_MENUITEM,
  GUMBO_TAG_APPLET,
  GUMBO_TAG_ACRONYM,
  GUMBO_TAG_BGSOUND,
  GUMBO_TAG_DIR,
  GUMBO_TAG_FRAME,
  GUMBO_TAG_FRAMESET,
  GUMBO_TAG_NOFRAMES,
  GUMBO_TAG_LISTING,
  GUMBO_TAG_XMP,
  GUMBO_TAG_NEXTID,
  GUMBO_TAG_NOEMBED,
  GUMBO_TAG_PLAINTEXT,
  GUMBO_TAG_RB,
  GUMBO_TAG_STRIKE,
  GUMBO_TAG_BASEFONT,
  GUMBO_TAG_BIG,
  GUMBO_TAG_BLINK,
  GUMBO_TAG_CENTER,
  GUMBO_TAG_FONT,
  GUMBO_TAG_MARQUEE,
  GUMBO_TAG_MULTICOL,
  GUMBO_TAG_NOBR,
  GUMBO_TAG_SPACER,
  GUMBO_TAG_TT,
  GUMBO_TAG_RTC,
  GUMBO_TAG_DIALOG,
  // Used for all tags that don't have special handling in HTML.
  GUMBO_TAG_UNKNOWN,
  // A marker value to indicate the end of the enum, for iterating over it.
  GUMBO_TAG_LAST,
} GumboTag;
|
300
|
+
|
301
|
+
/**
|
302
|
+
* Returns the normalized (all lower case) tag name for a `GumboTag` enum. The
|
303
|
+
* return value is static data owned by the library.
|
304
|
+
*/
|
305
|
+
const char* gumbo_normalized_tagname(GumboTag tag);
|
306
|
+
|
307
|
+
/**
|
308
|
+
* Extracts the tag name from the `original_text` field of an element
|
309
|
+
* or token by stripping off `</>` characters and attributes and
|
310
|
+
* adjusting the passed-in `GumboStringPiece` appropriately. The tag
|
311
|
+
* name is in the original case and shares a buffer with the original
|
312
|
+
* text, to simplify memory management. Behavior is undefined if a
|
313
|
+
* string piece that doesn't represent an HTML tag (`<tagname>` or
|
314
|
+
* `</tagname>`) is passed in. If the string piece is completely
|
315
|
+
* empty (`NULL` data pointer), then this function will exit
|
316
|
+
* successfully as a no-op.
|
317
|
+
*/
|
318
|
+
void gumbo_tag_from_original_text(GumboStringPiece* text);
|
319
|
+
|
320
|
+
/**
|
321
|
+
* Fixes the case of SVG elements that are not all lowercase. This is
|
322
|
+
* not done at parse time because there's no place to store a mutated
|
323
|
+
* tag name. `tag_name` is an enum (which will be `TAG_UNKNOWN` for most
|
324
|
+
* SVG tags without special handling), while `original_tag_name` is a
|
325
|
+
* pointer into the original buffer. Instead, we provide this helper
|
326
|
+
* function that clients can use to rename SVG tags as appropriate.
|
327
|
+
* Returns the case-normalized SVG tagname if a replacement is found, or
|
328
|
+
* `NULL` if no normalization is called for. The return value is static
|
329
|
+
* data and owned by the library.
|
330
|
+
*
|
331
|
+
* @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
|
332
|
+
*/
|
333
|
+
const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
|
334
|
+
|
335
|
+
/**
|
336
|
+
* Converts a tag name string (which may be in upper or mixed case) to a
|
337
|
+
* tag enum.
|
338
|
+
*/
|
339
|
+
GumboTag gumbo_tagn_enum(const char* tagname, size_t length);
|
340
|
+
|
341
|
+
/**
 * Attribute namespaces.
 * HTML includes special handling for XLink, XML, and XMLNS namespaces
 * on attributes. Everything else goes in the generic "NONE" namespace.
 */
typedef enum {
  GUMBO_ATTR_NAMESPACE_NONE,
  GUMBO_ATTR_NAMESPACE_XLINK,
  GUMBO_ATTR_NAMESPACE_XML,
  GUMBO_ATTR_NAMESPACE_XMLNS,
} GumboAttributeNamespaceEnum;
|
352
|
+
|
353
|
+
/**
|
354
|
+
* A struct representing a single attribute on a HTML tag. This is a
|
355
|
+
* name-value pair, but also includes information about source locations
|
356
|
+
* and original source text.
|
357
|
+
*/
|
358
|
+
typedef struct {
|
359
|
+
/**
|
360
|
+
* The namespace for the attribute. This will usually be
|
361
|
+
* `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes
|
362
|
+
* take special values, per:
|
363
|
+
* https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
|
364
|
+
*/
|
365
|
+
GumboAttributeNamespaceEnum attr_namespace;
|
366
|
+
|
367
|
+
/**
|
368
|
+
* The name of the attribute. This is in a freshly-allocated buffer to
|
369
|
+
* deal with case-normalization and is null-terminated.
|
370
|
+
*/
|
371
|
+
const char* name;
|
372
|
+
|
373
|
+
/**
|
374
|
+
* The original text of the attribute name, as a pointer into the
|
375
|
+
* original source buffer.
|
376
|
+
*/
|
377
|
+
GumboStringPiece original_name;
|
378
|
+
|
379
|
+
/**
|
380
|
+
* The value of the attribute. This is in a freshly-allocated buffer
|
381
|
+
* to deal with unescaping and is null-terminated. It does not include
|
382
|
+
* any quotes that surround the attribute. If the attribute has no
|
383
|
+
* value (for example, `selected` on a checkbox) this will be an empty
|
384
|
+
* string.
|
385
|
+
*/
|
386
|
+
const char* value;
|
387
|
+
|
388
|
+
/**
|
389
|
+
* The original text of the value of the attribute. This points into
|
390
|
+
* the original source buffer. It includes any quotes that surround
|
391
|
+
* the attribute and you can look at `original_value.data[0]` and
|
392
|
+
* `original_value.data[original_value.length - 1]` to determine what
|
393
|
+
* the quote characters were. If the attribute has no value this will
|
394
|
+
* be a 0-length string.
|
395
|
+
*/
|
396
|
+
GumboStringPiece original_value;
|
397
|
+
|
398
|
+
/** The starting position of the attribute name. */
|
399
|
+
GumboSourcePosition name_start;
|
400
|
+
|
401
|
+
/**
|
402
|
+
* The ending position of the attribute name. This is not always derivable
|
403
|
+
* from the starting position of the value because of the possibility of
|
404
|
+
* whitespace around the `=` sign.
|
405
|
+
*/
|
406
|
+
GumboSourcePosition name_end;
|
407
|
+
|
408
|
+
/** The starting position of the attribute value. */
|
409
|
+
GumboSourcePosition value_start;
|
410
|
+
|
411
|
+
/** The ending position of the attribute value. */
|
412
|
+
GumboSourcePosition value_end;
|
413
|
+
} GumboAttribute;
|
414
|
+
|
415
|
+
/**
|
416
|
+
* Given a vector of `GumboAttribute`s, look up the one with the
|
417
|
+
* specified name and return it, or `NULL` if no such attribute exists.
|
418
|
+
* This uses a case-insensitive match, as HTML is case-insensitive.
|
419
|
+
*/
|
420
|
+
GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
|
421
|
+
|
422
|
+
/**
|
423
|
+
* Enum denoting the type of node. This determines the type of the
|
424
|
+
* `node.v` union.
|
425
|
+
*/
|
426
|
+
typedef enum {
|
427
|
+
/** Document node. `v` will be a `GumboDocument`. */
|
428
|
+
GUMBO_NODE_DOCUMENT,
|
429
|
+
/** Element node. `v` will be a `GumboElement`. */
|
430
|
+
GUMBO_NODE_ELEMENT,
|
431
|
+
/** Text node. `v` will be a `GumboText`. */
|
432
|
+
GUMBO_NODE_TEXT,
|
433
|
+
/** CDATA node. `v` will be a `GumboText`. */
|
434
|
+
GUMBO_NODE_CDATA,
|
435
|
+
/** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */
|
436
|
+
GUMBO_NODE_COMMENT,
|
437
|
+
/** Text node, where all contents is whitespace. `v` will be a `GumboText`. */
|
438
|
+
GUMBO_NODE_WHITESPACE,
|
439
|
+
/**
|
440
|
+
* Template node. This is separate from `GUMBO_NODE_ELEMENT` because
|
441
|
+
* many client libraries will want to ignore the contents of template
|
442
|
+
* nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will
|
443
|
+
* do the right thing here, while clients that want to include template
|
444
|
+
* contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a
|
445
|
+
* `GumboElement`.
|
446
|
+
*/
|
447
|
+
GUMBO_NODE_TEMPLATE
|
448
|
+
} GumboNodeType;
|
449
|
+
|
450
|
+
/**
|
451
|
+
* Forward declaration of GumboNode so it can be used recursively in
|
452
|
+
* GumboNode.parent.
|
453
|
+
*/
|
454
|
+
typedef struct GumboInternalNode GumboNode;
|
455
|
+
|
456
|
+
/** https://dom.spec.whatwg.org/#concept-document-quirks */
|
457
|
+
typedef enum {
|
458
|
+
GUMBO_DOCTYPE_NO_QUIRKS,
|
459
|
+
GUMBO_DOCTYPE_QUIRKS,
|
460
|
+
GUMBO_DOCTYPE_LIMITED_QUIRKS
|
461
|
+
} GumboQuirksModeEnum;
|
462
|
+
|
463
|
+
/**
|
464
|
+
* Namespaces.
|
465
|
+
* Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
|
466
|
+
* Rather, anything inside an `<svg>` tag is in the SVG namespace,
|
467
|
+
* anything inside the `<math>` tag is in the MathML namespace, and
|
468
|
+
* anything else is inside the HTML namespace. No other namespaces are
|
469
|
+
* supported, so this can be an `enum`.
|
470
|
+
*/
|
471
|
+
typedef enum {
|
472
|
+
GUMBO_NAMESPACE_HTML,
|
473
|
+
GUMBO_NAMESPACE_SVG,
|
474
|
+
GUMBO_NAMESPACE_MATHML
|
475
|
+
} GumboNamespaceEnum;
|
476
|
+
|
477
|
+
/**
|
478
|
+
* Parse flags.
|
479
|
+
* We track the reasons for parser insertion of nodes and store them in
|
480
|
+
* a bitvector in the node itself. This lets client code optimize out
|
481
|
+
* nodes that are implied by the HTML structure of the document, or flag
|
482
|
+
* constructs that may not be allowed by a style guide, or track the
|
483
|
+
* prevalence of incorrect or tricky HTML code.
|
484
|
+
*/
|
485
|
+
typedef enum {
|
486
|
+
/**
|
487
|
+
* A normal node -- both start and end tags appear in the source,
|
488
|
+
* nothing has been reparented.
|
489
|
+
*/
|
490
|
+
GUMBO_INSERTION_NORMAL = 0,
|
491
|
+
|
492
|
+
/**
|
493
|
+
* A node inserted by the parser to fulfill some implicit insertion
|
494
|
+
* rule. This is usually set in addition to some other flag giving a
|
495
|
+
* more specific insertion reason; it's a generic catch-all term
|
496
|
+
* meaning "The start tag for this node did not appear in the document
|
497
|
+
* source".
|
498
|
+
*/
|
499
|
+
GUMBO_INSERTION_BY_PARSER = 1 << 0,
|
500
|
+
|
501
|
+
/**
|
502
|
+
* A flag indicating that the end tag for this node did not appear in
|
503
|
+
* the document source. Note that in some cases, you can still have
|
504
|
+
* parser-inserted nodes with an explicit end tag. For example,
|
505
|
+
* `Text</html>` has `GUMBO_INSERTION_BY_PARSER` set on the `<html>`
|
506
|
+
* node, but `GUMBO_INSERTION_IMPLICIT_END_TAG` is unset, as the
|
507
|
+
* `</html>` tag actually exists.
|
508
|
+
*
|
509
|
+
* This flag will be set only if the end tag is completely missing.
|
510
|
+
* In some cases, the end tag may be misplaced (e.g. a `</body>` tag
|
511
|
+
* with text afterwards), which will leave this flag unset and require
|
512
|
+
* clients to inspect the parse errors for that case.
|
513
|
+
*/
|
514
|
+
GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
|
515
|
+
|
516
|
+
// Value 1 << 2 was for a flag that has since been removed.
|
517
|
+
|
518
|
+
/**
|
519
|
+
* A flag for nodes that are inserted because their presence is
|
520
|
+
* implied by other tags, e.g. `<html>`, `<head>`, `<body>`,
|
521
|
+
* `<tbody>`, etc.
|
522
|
+
*/
|
523
|
+
GUMBO_INSERTION_IMPLIED = 1 << 3,
|
524
|
+
|
525
|
+
/**
|
526
|
+
* A flag for nodes that are converted from their end tag equivalents.
|
527
|
+
* For example, `</p>` when no paragraph is open implies that the
|
528
|
+
* parser should create a `<p>` tag and immediately close it, while
|
529
|
+
* `</br>` means the same thing as `<br>`.
|
530
|
+
*/
|
531
|
+
GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
|
532
|
+
|
533
|
+
// Value 1 << 5 was for a flag that has since been removed.
|
534
|
+
|
535
|
+
/** A flag for `<image>` tags that are rewritten as `<img>`. */
|
536
|
+
GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
|
537
|
+
|
538
|
+
/**
|
539
|
+
* A flag for nodes that are cloned as a result of the reconstruction
|
540
|
+
* of active formatting elements. This is set only on the clone; the
|
541
|
+
* initial portion of the formatting run is a NORMAL node with an
|
542
|
+
* `IMPLICIT_END_TAG`.
|
543
|
+
*/
|
544
|
+
GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
|
545
|
+
|
546
|
+
/** A flag for nodes that are cloned by the adoption agency algorithm. */
|
547
|
+
GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
|
548
|
+
|
549
|
+
/** A flag for nodes that are moved by the adoption agency algorithm. */
|
550
|
+
GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
|
551
|
+
|
552
|
+
/**
|
553
|
+
* A flag for nodes that have been foster-parented out of a table (or
|
554
|
+
* should've been foster-parented, if verbatim mode is set).
|
555
|
+
*/
|
556
|
+
GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
|
557
|
+
} GumboParseFlags;
|
558
|
+
|
559
|
+
/** Information specific to document nodes. */
|
560
|
+
typedef struct {
|
561
|
+
/**
|
562
|
+
* An array of `GumboNode`s, containing the children of this document.
|
563
|
+
* This will normally consist of the `<html>` element and any comment
|
564
|
+
* nodes found. Pointers are owned.
|
565
|
+
*/
|
566
|
+
GumboVector /* GumboNode* */ children;
|
567
|
+
|
568
|
+
/**
|
569
|
+
* `true` if there was an explicit doctype token, as opposed to it
|
570
|
+
* being omitted.
|
571
|
+
*/
|
572
|
+
bool has_doctype;
|
573
|
+
|
574
|
+
// Fields from the doctype token, copied verbatim.
|
575
|
+
const char* name;
|
576
|
+
const char* public_identifier;
|
577
|
+
const char* system_identifier;
|
578
|
+
|
579
|
+
/**
|
580
|
+
* Whether or not the document is in QuirksMode, as determined by the
|
581
|
+
* values in the GumboTokenDocType template.
|
582
|
+
*/
|
583
|
+
GumboQuirksModeEnum doc_type_quirks_mode;
|
584
|
+
} GumboDocument;
|
585
|
+
|
586
|
+
/**
|
587
|
+
* The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
|
588
|
+
* nodes. This contains just a block of text and its position.
|
589
|
+
*/
|
590
|
+
typedef struct {
|
591
|
+
/**
|
592
|
+
* The text of this node, after entities have been parsed and decoded.
|
593
|
+
* For comment and cdata nodes, this does not include the comment
|
594
|
+
* delimiters.
|
595
|
+
*/
|
596
|
+
const char* text;
|
597
|
+
|
598
|
+
/**
|
599
|
+
* The original text of this node, as a pointer into the original
|
600
|
+
* buffer. For comment/cdata nodes, this includes the comment
|
601
|
+
* delimiters.
|
602
|
+
*/
|
603
|
+
GumboStringPiece original_text;
|
604
|
+
|
605
|
+
/**
|
606
|
+
* The starting position of this node. This corresponds to the
|
607
|
+
* position of `original_text`, before entities are decoded.
|
608
|
+
*/
|
609
|
+
GumboSourcePosition start_pos;
|
610
|
+
} GumboText;
|
611
|
+
|
612
|
+
/**
|
613
|
+
* The struct used to represent all HTML elements. This contains
|
614
|
+
* information about the tag, attributes, and child nodes.
|
615
|
+
*/
|
616
|
+
typedef struct {
|
617
|
+
/**
|
618
|
+
* An array of `GumboNode`s, containing the children of this element.
|
619
|
+
* Pointers are owned.
|
620
|
+
*/
|
621
|
+
GumboVector /* GumboNode* */ children;
|
622
|
+
|
623
|
+
/** The GumboTag enum for this element. */
|
624
|
+
GumboTag tag;
|
625
|
+
|
626
|
+
/** The name for this element. */
|
627
|
+
const char* name;
|
628
|
+
|
629
|
+
/** The GumboNamespaceEnum for this element. */
|
630
|
+
GumboNamespaceEnum tag_namespace;
|
631
|
+
|
632
|
+
/**
|
633
|
+
* A `GumboStringPiece` pointing to the original tag text for this
|
634
|
+
* element, pointing directly into the source buffer. If the tag was
|
635
|
+
* inserted algorithmically (for example, `<head>` or `<tbody>`
|
636
|
+
* insertion), this will be a zero-length string.
|
637
|
+
*/
|
638
|
+
GumboStringPiece original_tag;
|
639
|
+
|
640
|
+
/**
|
641
|
+
* A `GumboStringPiece` pointing to the original end tag text for this
|
642
|
+
* element. If the end tag was inserted algorithmically (for example,
|
643
|
+
* closing a self-closing tag), this will be a zero-length string.
|
644
|
+
*/
|
645
|
+
GumboStringPiece original_end_tag;
|
646
|
+
|
647
|
+
/** The source position for the start of the start tag. */
|
648
|
+
GumboSourcePosition start_pos;
|
649
|
+
|
650
|
+
/** The source position for the start of the end tag. */
|
651
|
+
GumboSourcePosition end_pos;
|
652
|
+
|
653
|
+
/**
|
654
|
+
* An array of `GumboAttribute`s, containing the attributes for this
|
655
|
+
* tag in the order that they were parsed. Pointers are owned.
|
656
|
+
*/
|
657
|
+
GumboVector /* GumboAttribute* */ attributes;
|
658
|
+
} GumboElement;
|
659
|
+
|
660
|
+
/**
|
661
|
+
* A supertype for `GumboDocument`, `GumboElement`, and `GumboText`, so that we can
|
662
|
+
* include one generic type in lists of children and cast as necessary
|
663
|
+
* to subtypes.
|
664
|
+
*/
|
665
|
+
struct GumboInternalNode {
|
666
|
+
/** The type of node that this is. */
|
667
|
+
GumboNodeType type;
|
668
|
+
|
669
|
+
/** Pointer back to parent node. Not owned. */
|
670
|
+
GumboNode* parent;
|
671
|
+
|
672
|
+
/** The index within the parent's children vector of this node. */
|
673
|
+
unsigned int index_within_parent;
|
674
|
+
|
675
|
+
/**
|
676
|
+
* A bitvector of flags containing information about why this element
|
677
|
+
* was inserted into the parse tree, including a variety of special
|
678
|
+
* parse situations.
|
679
|
+
*/
|
680
|
+
GumboParseFlags parse_flags;
|
681
|
+
|
682
|
+
/** The actual node data. */
|
683
|
+
union {
|
684
|
+
GumboDocument document; // For GUMBO_NODE_DOCUMENT.
|
685
|
+
GumboElement element; // For GUMBO_NODE_ELEMENT and GUMBO_NODE_TEMPLATE.
|
686
|
+
GumboText text; // For everything else.
|
687
|
+
} v;
|
688
|
+
};
|
689
|
+
|
690
|
+
/**
|
691
|
+
* Input struct containing configuration options for the parser.
|
692
|
+
* These let you specify alternate memory managers, provide different
|
693
|
+
* error handling, etc. Use `kGumboDefaultOptions` for sensible
|
694
|
+
* defaults and only set what you need.
|
695
|
+
*/
|
696
|
+
typedef struct GumboInternalOptions {
|
697
|
+
/**
|
698
|
+
* The tab-stop size, for computing positions in HTML files that
|
699
|
+
* use tabs. Default: `8`.
|
700
|
+
*/
|
701
|
+
int tab_stop;
|
702
|
+
|
703
|
+
/**
|
704
|
+
* Whether or not to stop parsing when the first error is encountered.
|
705
|
+
* Default: `false`.
|
706
|
+
*/
|
707
|
+
bool stop_on_first_error;
|
708
|
+
|
709
|
+
/**
|
710
|
+
* Maximum allowed number of attributes per element. If this limit is
|
711
|
+
* exceeded, the parser will return early with a partial document and
|
712
|
+
* the returned `GumboOutput` will have its `status` field set to
|
713
|
+
* `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
|
714
|
+
* Default: `400`.
|
715
|
+
*/
|
716
|
+
int max_attributes;
|
717
|
+
|
718
|
+
/**
|
719
|
+
* Maximum allowed depth for the parse tree. If this limit is exceeded,
|
720
|
+
* the parser will return early with a partial document and the returned
|
721
|
+
* `GumboOutput` will have its `status` field set to
|
722
|
+
* `GUMBO_STATUS_TREE_TOO_DEEP`.
|
723
|
+
* Default: `400`.
|
724
|
+
*/
|
725
|
+
unsigned int max_tree_depth;
|
726
|
+
|
727
|
+
/**
|
728
|
+
* The maximum number of errors before the parser stops recording
|
729
|
+
* them. This is provided so that if the page is totally borked, we
|
730
|
+
* don't completely fill up the errors vector and exhaust memory with
|
731
|
+
* useless redundant errors. Set to `-1` to disable the limit.
|
732
|
+
* Default: `-1`.
|
733
|
+
*/
|
734
|
+
int max_errors;
|
735
|
+
|
736
|
+
/**
|
737
|
+
* The fragment context for parsing:
|
738
|
+
* https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
|
739
|
+
*
|
740
|
+
* If `NULL` is passed here, it is assumed to be "no
|
741
|
+
* fragment", i.e. the regular parsing algorithm. Otherwise, pass the
|
742
|
+
* tag name for the intended parent of the parsed fragment. We use the
|
743
|
+
* tag name, namespace, and encoding attribute which are sufficient to
|
744
|
+
* set all of the parsing context needed for fragment parsing.
|
745
|
+
*
|
746
|
+
* Default: `NULL`.
|
747
|
+
*/
|
748
|
+
const char* fragment_context;
|
749
|
+
|
750
|
+
/**
|
751
|
+
* The namespace for the fragment context. This lets client code
|
752
|
+
* differentiate between, say, parsing a `<title>` tag in SVG vs.
|
753
|
+
* parsing it in HTML.
|
754
|
+
*
|
755
|
+
* Default: `GUMBO_NAMESPACE_HTML`.
|
756
|
+
*/
|
757
|
+
GumboNamespaceEnum fragment_namespace;
|
758
|
+
|
759
|
+
/**
|
760
|
+
* The value of the fragment context's `encoding` attribute, if any.
|
761
|
+
* Set to `NULL` for no `encoding` attribute.
|
762
|
+
*
|
763
|
+
* Default: `NULL`.
|
764
|
+
*/
|
765
|
+
const char* fragment_encoding;
|
766
|
+
|
767
|
+
/**
|
768
|
+
* Quirks mode for fragment parsing. The quirks mode for a given DOCTYPE can
|
769
|
+
* be looked up using `gumbo_compute_quirks_mode()`.
|
770
|
+
*
|
771
|
+
* Default: `GUMBO_DOCTYPE_NO_QUIRKS`.
|
772
|
+
*/
|
773
|
+
GumboQuirksModeEnum quirks_mode;
|
774
|
+
|
775
|
+
/**
|
776
|
+
* For fragment parsing. Set this to true if the context node has a form
|
777
|
+
* element as an ancestor.
|
778
|
+
*
|
779
|
+
* Default: `false`.
|
780
|
+
*/
|
781
|
+
bool fragment_context_has_form_ancestor;
|
782
|
+
} GumboOptions;
|
783
|
+
|
784
|
+
/** Default options struct; use this with gumbo_parse_with_options. */
|
785
|
+
extern const GumboOptions kGumboDefaultOptions;
|
786
|
+
|
787
|
+
/**
|
788
|
+
* Status code indicating whether parsing finished successfully or
|
789
|
+
* was stopped mid-document due to exceptional circumstances.
|
790
|
+
*/
|
791
|
+
typedef enum {
|
792
|
+
/**
|
793
|
+
* Indicates that parsing completed successfully. The resulting tree
|
794
|
+
* will be a complete document.
|
795
|
+
*/
|
796
|
+
GUMBO_STATUS_OK,
|
797
|
+
|
798
|
+
/**
|
799
|
+
* Indicates that the maximum element nesting limit
|
800
|
+
* (`GumboOptions::max_tree_depth`) was reached during parsing. The
|
801
|
+
* resulting tree will be a partial document, with no further nodes
|
802
|
+
* created after the point where the limit was reached. The partial
|
803
|
+
* document may be useful for constructing an error message but
|
804
|
+
* typically shouldn't be used for other purposes.
|
805
|
+
*/
|
806
|
+
GUMBO_STATUS_TREE_TOO_DEEP,
|
807
|
+
|
808
|
+
/**
|
809
|
+
* Indicates that the maximum number of attributes per element
|
810
|
+
* (`GumboOptions::max_attributes`) was reached during parsing. The
|
811
|
+
* resulting tree will be a partial document, with no further nodes
|
812
|
+
* created after the point where the limit was reached. The partial
|
813
|
+
* document may be useful for constructing an error message but
|
814
|
+
* typically shouldn't be used for other purposes.
|
815
|
+
*/
|
816
|
+
GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
|
817
|
+
|
818
|
+
// Currently unused
|
819
|
+
GUMBO_STATUS_OUT_OF_MEMORY,
|
820
|
+
} GumboOutputStatus;
|
821
|
+
|
822
|
+
|
823
|
+
/** The output struct containing the results of the parse. */
|
824
|
+
typedef struct GumboInternalOutput {
|
825
|
+
/**
|
826
|
+
* Pointer to the document node. This is a `GumboNode` of type
|
827
|
+
* `GUMBO_NODE_DOCUMENT` that contains the entire document as its child.
|
828
|
+
*/
|
829
|
+
GumboNode* document;
|
830
|
+
|
831
|
+
/**
|
832
|
+
* Pointer to the root node. This is the `<html>` tag that forms the
|
833
|
+
* root of the document.
|
834
|
+
*/
|
835
|
+
GumboNode* root;
|
836
|
+
|
837
|
+
/**
|
838
|
+
* A list of errors that occurred during the parse.
|
839
|
+
*/
|
840
|
+
GumboVector /* GumboError */ errors;
|
841
|
+
|
842
|
+
/**
|
843
|
+
* True if the parser encountered an error.
|
844
|
+
*
|
845
|
+
* This can be true and `errors` an empty `GumboVector` if the `max_errors`
|
846
|
+
* option was set to 0.
|
847
|
+
*/
|
848
|
+
bool document_error;
|
849
|
+
|
850
|
+
/**
|
851
|
+
* A status code indicating whether parsing finished successfully or was
|
852
|
+
* stopped mid-document due to exceptional circumstances.
|
853
|
+
*/
|
854
|
+
GumboOutputStatus status;
|
855
|
+
} GumboOutput;
|
856
|
+
|
857
|
+
/**
|
858
|
+
* Parses a buffer of UTF-8 text into a `GumboNode` parse tree. The
|
859
|
+
* buffer must live at least as long as the parse tree, as some fields
|
860
|
+
* (eg. `original_text`) point directly into the original buffer.
|
861
|
+
*
|
862
|
+
* This doesn't support buffers longer than 4 gigabytes.
|
863
|
+
*/
|
864
|
+
GumboOutput* gumbo_parse(const char* buffer);
|
865
|
+
|
866
|
+
/**
|
867
|
+
* Extended version of `gumbo_parse` that takes an explicit options
|
868
|
+
* structure, buffer, and length.
|
869
|
+
*/
|
870
|
+
GumboOutput* gumbo_parse_with_options (
|
871
|
+
const GumboOptions* options,
|
872
|
+
const char* buffer,
|
873
|
+
size_t buffer_length
|
874
|
+
);
|
875
|
+
|
876
|
+
/**
|
877
|
+
* Compute the quirks mode based on the name, public identifier, and system
|
878
|
+
* identifier. Any of these may be `NULL` to indicate a missing value.
|
879
|
+
*/
|
880
|
+
GumboQuirksModeEnum gumbo_compute_quirks_mode (
|
881
|
+
const char *name,
|
882
|
+
const char *pubid,
|
883
|
+
const char *sysid
|
884
|
+
);
|
885
|
+
|
886
|
+
/** Convert a `GumboOutputStatus` code into a readable description. */
|
887
|
+
const char* gumbo_status_to_string(GumboOutputStatus status);
|
888
|
+
|
889
|
+
/** Release the memory used for the parse tree and parse errors. */
|
890
|
+
void gumbo_destroy_output(GumboOutput* output);
|
891
|
+
|
892
|
+
/** Opaque GumboError type */
|
893
|
+
typedef struct GumboInternalError GumboError;
|
894
|
+
|
895
|
+
/**
|
896
|
+
* Returns the position of the error.
|
897
|
+
*/
|
898
|
+
GumboSourcePosition gumbo_error_position(const GumboError* error);
|
899
|
+
|
900
|
+
/**
|
901
|
+
* Returns a constant string representation of the error's code. This is owned
|
902
|
+
* by the library and should not be freed by the caller.
|
903
|
+
*/
|
904
|
+
const char* gumbo_error_code(const GumboError* error);
|
905
|
+
|
906
|
+
/**
|
907
|
+
* Prints an error to a string. This stores a freshly-allocated buffer
|
908
|
+
* containing the error message text in output. The caller is responsible for
|
909
|
+
* freeing the buffer. The size of the error message is returned. The error
|
910
|
+
* message itself may not be NUL-terminated and may contain NUL bytes so the
|
911
|
+
* returned size must be used.
|
912
|
+
*/
|
913
|
+
size_t gumbo_error_to_string(const GumboError* error, char **output);
|
914
|
+
|
915
|
+
/**
|
916
|
+
* Prints a caret diagnostic to a string. This stores a freshly-allocated
|
917
|
+
* buffer containing the error message text in output. The caller is responsible for
|
918
|
+
* freeing the buffer. The size of the error message is returned. The error
|
919
|
+
* message itself may not be NUL-terminated and may contain NUL bytes so the
|
920
|
+
* returned size must be used.
|
921
|
+
*/
|
922
|
+
size_t gumbo_caret_diagnostic_to_string (
|
923
|
+
const GumboError* error,
|
924
|
+
const char* source_text,
|
925
|
+
size_t source_length,
|
926
|
+
char** output
|
927
|
+
);
|
928
|
+
|
929
|
+
/**
|
930
|
+
* Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
|
931
|
+
* instead of writing to a string.
|
932
|
+
*/
|
933
|
+
void gumbo_print_caret_diagnostic (
|
934
|
+
const GumboError* error,
|
935
|
+
const char* source_text,
|
936
|
+
size_t source_length
|
937
|
+
);
|
938
|
+
|
939
|
+
#ifdef __cplusplus
|
940
|
+
}
|
941
|
+
#endif
|
942
|
+
|
943
|
+
#endif // GUMBO_H
|