Nokogiri_precompiled_aarch64_dedshit 1.14.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +44 -0
- data/LICENSE-DEPENDENCIES.md +2224 -0
- data/LICENSE.md +9 -0
- data/README.md +287 -0
- data/bin/nokogiri +131 -0
- data/dependencies.yml +41 -0
- data/ext/java/nokogiri/Html4Document.java +157 -0
- data/ext/java/nokogiri/Html4ElementDescription.java +133 -0
- data/ext/java/nokogiri/Html4EntityLookup.java +63 -0
- data/ext/java/nokogiri/Html4SaxParserContext.java +289 -0
- data/ext/java/nokogiri/Html4SaxPushParser.java +213 -0
- data/ext/java/nokogiri/NokogiriService.java +613 -0
- data/ext/java/nokogiri/XmlAttr.java +154 -0
- data/ext/java/nokogiri/XmlAttributeDecl.java +119 -0
- data/ext/java/nokogiri/XmlCdata.java +60 -0
- data/ext/java/nokogiri/XmlComment.java +77 -0
- data/ext/java/nokogiri/XmlDocument.java +705 -0
- data/ext/java/nokogiri/XmlDocumentFragment.java +163 -0
- data/ext/java/nokogiri/XmlDtd.java +516 -0
- data/ext/java/nokogiri/XmlElement.java +44 -0
- data/ext/java/nokogiri/XmlElementContent.java +412 -0
- data/ext/java/nokogiri/XmlElementDecl.java +148 -0
- data/ext/java/nokogiri/XmlEntityDecl.java +151 -0
- data/ext/java/nokogiri/XmlEntityReference.java +79 -0
- data/ext/java/nokogiri/XmlNamespace.java +193 -0
- data/ext/java/nokogiri/XmlNode.java +1938 -0
- data/ext/java/nokogiri/XmlNodeSet.java +463 -0
- data/ext/java/nokogiri/XmlProcessingInstruction.java +79 -0
- data/ext/java/nokogiri/XmlReader.java +615 -0
- data/ext/java/nokogiri/XmlRelaxng.java +133 -0
- data/ext/java/nokogiri/XmlSaxParserContext.java +329 -0
- data/ext/java/nokogiri/XmlSaxPushParser.java +288 -0
- data/ext/java/nokogiri/XmlSchema.java +423 -0
- data/ext/java/nokogiri/XmlSyntaxError.java +137 -0
- data/ext/java/nokogiri/XmlText.java +90 -0
- data/ext/java/nokogiri/XmlXpathContext.java +305 -0
- data/ext/java/nokogiri/XsltStylesheet.java +368 -0
- data/ext/java/nokogiri/internals/ClosedStreamException.java +13 -0
- data/ext/java/nokogiri/internals/HtmlDomParserContext.java +252 -0
- data/ext/java/nokogiri/internals/IgnoreSchemaErrorsErrorHandler.java +27 -0
- data/ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java +178 -0
- data/ext/java/nokogiri/internals/NokogiriDomParser.java +99 -0
- data/ext/java/nokogiri/internals/NokogiriEntityResolver.java +140 -0
- data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +65 -0
- data/ext/java/nokogiri/internals/NokogiriHandler.java +339 -0
- data/ext/java/nokogiri/internals/NokogiriHelpers.java +817 -0
- data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +228 -0
- data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +110 -0
- data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java +86 -0
- data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler4NekoHtml.java +107 -0
- data/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +62 -0
- data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +165 -0
- data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +50 -0
- data/ext/java/nokogiri/internals/NokogiriXPathVariableResolver.java +37 -0
- data/ext/java/nokogiri/internals/NokogiriXsltErrorListener.java +70 -0
- data/ext/java/nokogiri/internals/ParserContext.java +262 -0
- data/ext/java/nokogiri/internals/ReaderNode.java +564 -0
- data/ext/java/nokogiri/internals/SaveContextVisitor.java +865 -0
- data/ext/java/nokogiri/internals/SchemaErrorHandler.java +50 -0
- data/ext/java/nokogiri/internals/XalanDTMManagerPatch.java +174 -0
- data/ext/java/nokogiri/internals/XmlDeclHandler.java +11 -0
- data/ext/java/nokogiri/internals/XmlDomParserContext.java +265 -0
- data/ext/java/nokogiri/internals/XmlSaxParser.java +40 -0
- data/ext/java/nokogiri/internals/c14n/AttrCompare.java +122 -0
- data/ext/java/nokogiri/internals/c14n/C14nHelper.java +178 -0
- data/ext/java/nokogiri/internals/c14n/CanonicalFilter.java +43 -0
- data/ext/java/nokogiri/internals/c14n/CanonicalizationException.java +106 -0
- data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +278 -0
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +664 -0
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11_OmitComments.java +45 -0
- data/ext/java/nokogiri/internals/c14n/Canonicalizer11_WithComments.java +45 -0
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315.java +388 -0
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +308 -0
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclOmitComments.java +47 -0
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclWithComments.java +51 -0
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315OmitComments.java +51 -0
- data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315WithComments.java +50 -0
- data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +660 -0
- data/ext/java/nokogiri/internals/c14n/CanonicalizerPhysical.java +194 -0
- data/ext/java/nokogiri/internals/c14n/CanonicalizerSpi.java +77 -0
- data/ext/java/nokogiri/internals/c14n/Constants.java +45 -0
- data/ext/java/nokogiri/internals/c14n/ElementProxy.java +325 -0
- data/ext/java/nokogiri/internals/c14n/HelperNodeList.java +106 -0
- data/ext/java/nokogiri/internals/c14n/IgnoreAllErrorHandler.java +86 -0
- data/ext/java/nokogiri/internals/c14n/InclusiveNamespaces.java +181 -0
- data/ext/java/nokogiri/internals/c14n/InvalidCanonicalizerException.java +87 -0
- data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +452 -0
- data/ext/java/nokogiri/internals/c14n/NodeFilter.java +52 -0
- data/ext/java/nokogiri/internals/c14n/UtfHelpper.java +190 -0
- data/ext/java/nokogiri/internals/c14n/XMLUtils.java +540 -0
- data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +1712 -0
- data/ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java +737 -0
- data/ext/nokogiri/depend +38 -0
- data/ext/nokogiri/extconf.rb +1086 -0
- data/ext/nokogiri/gumbo.c +594 -0
- data/ext/nokogiri/html4_document.c +167 -0
- data/ext/nokogiri/html4_element_description.c +294 -0
- data/ext/nokogiri/html4_entity_lookup.c +37 -0
- data/ext/nokogiri/html4_sax_parser_context.c +116 -0
- data/ext/nokogiri/html4_sax_push_parser.c +95 -0
- data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
- data/ext/nokogiri/nokogiri.c +265 -0
- data/ext/nokogiri/nokogiri.h +235 -0
- data/ext/nokogiri/test_global_handlers.c +42 -0
- data/ext/nokogiri/xml_attr.c +103 -0
- data/ext/nokogiri/xml_attribute_decl.c +70 -0
- data/ext/nokogiri/xml_cdata.c +57 -0
- data/ext/nokogiri/xml_comment.c +62 -0
- data/ext/nokogiri/xml_document.c +689 -0
- data/ext/nokogiri/xml_document_fragment.c +44 -0
- data/ext/nokogiri/xml_dtd.c +210 -0
- data/ext/nokogiri/xml_element_content.c +128 -0
- data/ext/nokogiri/xml_element_decl.c +69 -0
- data/ext/nokogiri/xml_encoding_handler.c +104 -0
- data/ext/nokogiri/xml_entity_decl.c +112 -0
- data/ext/nokogiri/xml_entity_reference.c +50 -0
- data/ext/nokogiri/xml_namespace.c +186 -0
- data/ext/nokogiri/xml_node.c +2426 -0
- data/ext/nokogiri/xml_node_set.c +496 -0
- data/ext/nokogiri/xml_processing_instruction.c +54 -0
- data/ext/nokogiri/xml_reader.c +794 -0
- data/ext/nokogiri/xml_relax_ng.c +164 -0
- data/ext/nokogiri/xml_sax_parser.c +316 -0
- data/ext/nokogiri/xml_sax_parser_context.c +283 -0
- data/ext/nokogiri/xml_sax_push_parser.c +166 -0
- data/ext/nokogiri/xml_schema.c +260 -0
- data/ext/nokogiri/xml_syntax_error.c +85 -0
- data/ext/nokogiri/xml_text.c +48 -0
- data/ext/nokogiri/xml_xpath_context.c +415 -0
- data/ext/nokogiri/xslt_stylesheet.c +363 -0
- data/gumbo-parser/CHANGES.md +63 -0
- data/gumbo-parser/Makefile +111 -0
- data/gumbo-parser/THANKS +27 -0
- data/gumbo-parser/src/Makefile +34 -0
- data/gumbo-parser/src/README.md +41 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +42 -0
- data/gumbo-parser/src/attribute.h +17 -0
- data/gumbo-parser/src/char_ref.c +22225 -0
- data/gumbo-parser/src/char_ref.h +29 -0
- data/gumbo-parser/src/char_ref.rl +2154 -0
- data/gumbo-parser/src/error.c +626 -0
- data/gumbo-parser/src/error.h +148 -0
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/foreign_attrs.gperf +27 -0
- data/gumbo-parser/src/insertion_mode.h +33 -0
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/nokogiri_gumbo.h +944 -0
- data/gumbo-parser/src/parser.c +4878 -0
- data/gumbo-parser/src/parser.h +41 -0
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +103 -0
- data/gumbo-parser/src/string_buffer.h +68 -0
- data/gumbo-parser/src/string_piece.c +48 -0
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_attrs.gperf +77 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/svg_tags.gperf +55 -0
- data/gumbo-parser/src/tag.c +223 -0
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.gperf +170 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +17 -0
- data/gumbo-parser/src/tokenizer.c +3463 -0
- data/gumbo-parser/src/tokenizer.h +112 -0
- data/gumbo-parser/src/tokenizer_states.h +339 -0
- data/gumbo-parser/src/utf8.c +245 -0
- data/gumbo-parser/src/utf8.h +164 -0
- data/gumbo-parser/src/util.c +66 -0
- data/gumbo-parser/src/util.h +34 -0
- data/gumbo-parser/src/vector.c +111 -0
- data/gumbo-parser/src/vector.h +45 -0
- data/lib/nokogiri/class_resolver.rb +67 -0
- data/lib/nokogiri/css/node.rb +54 -0
- data/lib/nokogiri/css/parser.rb +770 -0
- data/lib/nokogiri/css/parser.y +277 -0
- data/lib/nokogiri/css/parser_extras.rb +96 -0
- data/lib/nokogiri/css/syntax_error.rb +9 -0
- data/lib/nokogiri/css/tokenizer.rb +155 -0
- data/lib/nokogiri/css/tokenizer.rex +56 -0
- data/lib/nokogiri/css/xpath_visitor.rb +359 -0
- data/lib/nokogiri/css.rb +66 -0
- data/lib/nokogiri/decorators/slop.rb +44 -0
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +32 -0
- data/lib/nokogiri/gumbo.rb +15 -0
- data/lib/nokogiri/html.rb +48 -0
- data/lib/nokogiri/html4/builder.rb +37 -0
- data/lib/nokogiri/html4/document.rb +214 -0
- data/lib/nokogiri/html4/document_fragment.rb +54 -0
- data/lib/nokogiri/html4/element_description.rb +25 -0
- data/lib/nokogiri/html4/element_description_defaults.rb +572 -0
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/html4/entity_lookup.rb +15 -0
- data/lib/nokogiri/html4/sax/parser.rb +63 -0
- data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
- data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
- data/lib/nokogiri/html4.rb +47 -0
- data/lib/nokogiri/html5/document.rb +168 -0
- data/lib/nokogiri/html5/document_fragment.rb +90 -0
- data/lib/nokogiri/html5/node.rb +98 -0
- data/lib/nokogiri/html5.rb +389 -0
- data/lib/nokogiri/jruby/dependencies.rb +3 -0
- data/lib/nokogiri/jruby/isorelax/isorelax/20030108/isorelax-20030108.jar +0 -0
- data/lib/nokogiri/jruby/net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar +0 -0
- data/lib/nokogiri/jruby/net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar +0 -0
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/jruby/nu/validator/jing/20200702VNU/jing-20200702VNU.jar +0 -0
- data/lib/nokogiri/jruby/org/nokogiri/nekodtd/0.1.11.noko2/nekodtd-0.1.11.noko2.jar +0 -0
- data/lib/nokogiri/jruby/xalan/serializer/2.7.3/serializer-2.7.3.jar +0 -0
- data/lib/nokogiri/jruby/xalan/xalan/2.7.3/xalan-2.7.3.jar +0 -0
- data/lib/nokogiri/jruby/xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar +0 -0
- data/lib/nokogiri/jruby/xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar +0 -0
- data/lib/nokogiri/syntax_error.rb +6 -0
- data/lib/nokogiri/version/constant.rb +6 -0
- data/lib/nokogiri/version/info.rb +223 -0
- data/lib/nokogiri/version.rb +4 -0
- data/lib/nokogiri/xml/attr.rb +66 -0
- data/lib/nokogiri/xml/attribute_decl.rb +20 -0
- data/lib/nokogiri/xml/builder.rb +487 -0
- data/lib/nokogiri/xml/cdata.rb +13 -0
- data/lib/nokogiri/xml/character_data.rb +9 -0
- data/lib/nokogiri/xml/document.rb +471 -0
- data/lib/nokogiri/xml/document_fragment.rb +205 -0
- data/lib/nokogiri/xml/dtd.rb +34 -0
- data/lib/nokogiri/xml/element_content.rb +38 -0
- data/lib/nokogiri/xml/element_decl.rb +15 -0
- data/lib/nokogiri/xml/entity_decl.rb +21 -0
- data/lib/nokogiri/xml/entity_reference.rb +20 -0
- data/lib/nokogiri/xml/namespace.rb +58 -0
- data/lib/nokogiri/xml/node/save_options.rb +68 -0
- data/lib/nokogiri/xml/node.rb +1563 -0
- data/lib/nokogiri/xml/node_set.rb +447 -0
- data/lib/nokogiri/xml/notation.rb +19 -0
- data/lib/nokogiri/xml/parse_options.rb +213 -0
- data/lib/nokogiri/xml/pp/character_data.rb +21 -0
- data/lib/nokogiri/xml/pp/node.rb +57 -0
- data/lib/nokogiri/xml/pp.rb +4 -0
- data/lib/nokogiri/xml/processing_instruction.rb +11 -0
- data/lib/nokogiri/xml/reader.rb +105 -0
- data/lib/nokogiri/xml/relax_ng.rb +38 -0
- data/lib/nokogiri/xml/sax/document.rb +167 -0
- data/lib/nokogiri/xml/sax/parser.rb +125 -0
- data/lib/nokogiri/xml/sax/parser_context.rb +21 -0
- data/lib/nokogiri/xml/sax/push_parser.rb +61 -0
- data/lib/nokogiri/xml/sax.rb +6 -0
- data/lib/nokogiri/xml/schema.rb +73 -0
- data/lib/nokogiri/xml/searchable.rb +270 -0
- data/lib/nokogiri/xml/syntax_error.rb +72 -0
- data/lib/nokogiri/xml/text.rb +11 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +13 -0
- data/lib/nokogiri/xml/xpath.rb +21 -0
- data/lib/nokogiri/xml/xpath_context.rb +16 -0
- data/lib/nokogiri/xml.rb +76 -0
- data/lib/nokogiri/xslt/stylesheet.rb +27 -0
- data/lib/nokogiri/xslt.rb +65 -0
- data/lib/nokogiri.rb +120 -0
- data/lib/xsd/xmlparser/nokogiri.rb +106 -0
- metadata +391 -0
@@ -0,0 +1,944 @@
|
|
1
|
+
// Copyright 2010 Google Inc.
|
2
|
+
// Copyright 2018 Craig Barnes.
|
3
|
+
// Licensed under the Apache License, version 2.0.
|
4
|
+
|
5
|
+
// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions,
|
6
|
+
// GUMBO_ as a prefix for enum constants and kGumbo as a prefix for
|
7
|
+
// static constants
|
8
|
+
|
9
|
+
/**
|
10
|
+
* @file
|
11
|
+
* @mainpage Gumbo HTML Parser
|
12
|
+
*
|
13
|
+
* This provides a conformant, no-dependencies implementation of the
|
14
|
+
* [HTML5] parsing algorithm. It supports only UTF-8 -- if you need
|
15
|
+
* to parse a different encoding, run a preprocessing step to convert
|
16
|
+
* to UTF-8. It returns a parse tree made of the structs in this file.
|
17
|
+
*
|
18
|
+
* Example:
|
19
|
+
* @code
|
20
|
+
* GumboOutput* output = gumbo_parse(input);
|
21
|
+
* do_something_with_doctype(output->document);
|
22
|
+
* do_something_with_html_tree(output->root);
|
23
|
+
* gumbo_destroy_output(output);
|
24
|
+
* @endcode
|
25
|
+
*
|
26
|
+
* [HTML5]: https://html.spec.whatwg.org/multipage/
|
27
|
+
*/
|
28
|
+
|
29
|
+
#ifndef GUMBO_H
|
30
|
+
#define GUMBO_H
|
31
|
+
|
32
|
+
#include <stdbool.h>
|
33
|
+
#include <stddef.h>
|
34
|
+
|
35
|
+
#ifdef __cplusplus
|
36
|
+
extern "C" {
|
37
|
+
#endif
|
38
|
+
|
39
|
+
/**
|
40
|
+
* A struct representing a character position within the original text
|
41
|
+
* buffer. Line and column numbers are 1-based and offsets are 0-based,
|
42
|
+
* which matches how most editors and command-line tools work.
|
43
|
+
*/
|
44
|
+
typedef struct {
|
45
|
+
size_t line;
|
46
|
+
size_t column;
|
47
|
+
size_t offset;
|
48
|
+
} GumboSourcePosition;
|
49
|
+
|
50
|
+
/**
|
51
|
+
* A struct representing a string or part of a string. Strings within
|
52
|
+
* the parser are represented by a `char*` and a length; the `char*`
|
53
|
+
* points into an existing data buffer owned by some other code (often
|
54
|
+
* the original input). `GumboStringPiece`s are assumed (by convention)
|
55
|
+
* to be immutable, because they may share data. Clients should assume
|
56
|
+
* that it is not NUL-terminated and should always use explicit lengths
|
57
|
+
* when manipulating them.
|
58
|
+
*/
|
59
|
+
typedef struct {
|
60
|
+
/** A pointer to the beginning of the string. `NULL` if `length == 0`. */
|
61
|
+
const char* data;
|
62
|
+
|
63
|
+
/** The length of the string fragment, in bytes (may be zero). */
|
64
|
+
size_t length;
|
65
|
+
} GumboStringPiece;
|
66
|
+
|
67
|
+
#define GUMBO_EMPTY_STRING_INIT { .data = NULL, .length = 0 }
|
68
|
+
/** A constant to represent a 0-length null string. */
|
69
|
+
#define kGumboEmptyString (const GumboStringPiece)GUMBO_EMPTY_STRING_INIT
|
70
|
+
|
71
|
+
/**
|
72
|
+
* Compares two `GumboStringPiece`s, and returns `true` if they're
|
73
|
+
* equal or `false` otherwise.
|
74
|
+
*/
|
75
|
+
bool gumbo_string_equals (
|
76
|
+
const GumboStringPiece* str1,
|
77
|
+
const GumboStringPiece* str2
|
78
|
+
);
|
79
|
+
|
80
|
+
/**
|
81
|
+
* Compares two `GumboStringPiece`s, ignoring case, and returns `true`
|
82
|
+
* if they're equal or `false` otherwise.
|
83
|
+
*/
|
84
|
+
bool gumbo_string_equals_ignore_case (
|
85
|
+
const GumboStringPiece* str1,
|
86
|
+
const GumboStringPiece* str2
|
87
|
+
);
|
88
|
+
|
89
|
+
/**
|
90
|
+
* Check if the first `GumboStringPiece` is a prefix of the second, ignoring
|
91
|
+
* case.
|
92
|
+
*/
|
93
|
+
bool gumbo_string_prefix_ignore_case (
|
94
|
+
const GumboStringPiece* prefix,
|
95
|
+
const GumboStringPiece* str
|
96
|
+
);
|
97
|
+
|
98
|
+
/**
|
99
|
+
* A simple vector implementation. This stores a pointer to a data array
|
100
|
+
* and a length. All elements are stored as `void*`; client code must
|
101
|
+
* cast to the appropriate type. Overflows upon addition result in
|
102
|
+
* reallocation of the data array, with the size doubling to maintain
|
103
|
+
* `O(1)` amortized cost. There is no removal function, as this isn't
|
104
|
+
* needed for any of the operations within this library. Iteration can
|
105
|
+
* be done through inspecting the structure directly in a `for` loop.
|
106
|
+
*/
|
107
|
+
typedef struct {
|
108
|
+
/**
|
109
|
+
* Data elements. This points to a dynamically-allocated array of
|
110
|
+
* `capacity` elements, each a `void*` to the element itself.
|
111
|
+
*/
|
112
|
+
void** data;
|
113
|
+
|
114
|
+
/** Number of elements currently in the vector. */
|
115
|
+
unsigned int length;
|
116
|
+
|
117
|
+
/** Current array capacity. */
|
118
|
+
unsigned int capacity;
|
119
|
+
} GumboVector;
|
120
|
+
|
121
|
+
# define GUMBO_EMPTY_VECTOR_INIT { .data = NULL, .length = 0, .capacity = 0 }
|
122
|
+
/** An empty (0-length, 0-capacity) `GumboVector`. */
|
123
|
+
#define kGumboEmptyVector (const GumboVector)GUMBO_EMPTY_VECTOR_INIT
|
124
|
+
|
125
|
+
/**
|
126
|
+
* Returns the first index at which an element appears in this vector
|
127
|
+
* (testing by pointer equality), or `-1` if it never does.
|
128
|
+
*/
|
129
|
+
int gumbo_vector_index_of(GumboVector* vector, const void* element);
|
130
|
+
|
131
|
+
/**
|
132
|
+
* An `enum` for all the tags defined in the HTML5 standard. These
|
133
|
+
* correspond to the tag names themselves. Enum constants exist only
|
134
|
+
* for tags that appear in the spec itself (or for tags with special
|
135
|
+
* handling in the SVG and MathML namespaces). Any other tags appear
|
136
|
+
* as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained
|
137
|
+
* through `original_tag`.
|
138
|
+
*
|
139
|
+
* This is mostly for API convenience, so that clients of this library
|
140
|
+
* don't need to perform a `strcasecmp` to find the normalized tag
|
141
|
+
* name. It also has efficiency benefits, by letting the parser work
|
142
|
+
* with enums instead of strings.
|
143
|
+
*/
|
144
|
+
typedef enum {
|
145
|
+
GUMBO_TAG_HTML,
|
146
|
+
GUMBO_TAG_HEAD,
|
147
|
+
GUMBO_TAG_TITLE,
|
148
|
+
GUMBO_TAG_BASE,
|
149
|
+
GUMBO_TAG_LINK,
|
150
|
+
GUMBO_TAG_META,
|
151
|
+
GUMBO_TAG_STYLE,
|
152
|
+
GUMBO_TAG_SCRIPT,
|
153
|
+
GUMBO_TAG_NOSCRIPT,
|
154
|
+
GUMBO_TAG_TEMPLATE,
|
155
|
+
GUMBO_TAG_BODY,
|
156
|
+
GUMBO_TAG_ARTICLE,
|
157
|
+
GUMBO_TAG_SECTION,
|
158
|
+
GUMBO_TAG_NAV,
|
159
|
+
GUMBO_TAG_ASIDE,
|
160
|
+
GUMBO_TAG_H1,
|
161
|
+
GUMBO_TAG_H2,
|
162
|
+
GUMBO_TAG_H3,
|
163
|
+
GUMBO_TAG_H4,
|
164
|
+
GUMBO_TAG_H5,
|
165
|
+
GUMBO_TAG_H6,
|
166
|
+
GUMBO_TAG_HGROUP,
|
167
|
+
GUMBO_TAG_HEADER,
|
168
|
+
GUMBO_TAG_FOOTER,
|
169
|
+
GUMBO_TAG_ADDRESS,
|
170
|
+
GUMBO_TAG_P,
|
171
|
+
GUMBO_TAG_HR,
|
172
|
+
GUMBO_TAG_PRE,
|
173
|
+
GUMBO_TAG_BLOCKQUOTE,
|
174
|
+
GUMBO_TAG_OL,
|
175
|
+
GUMBO_TAG_UL,
|
176
|
+
GUMBO_TAG_LI,
|
177
|
+
GUMBO_TAG_DL,
|
178
|
+
GUMBO_TAG_DT,
|
179
|
+
GUMBO_TAG_DD,
|
180
|
+
GUMBO_TAG_FIGURE,
|
181
|
+
GUMBO_TAG_FIGCAPTION,
|
182
|
+
GUMBO_TAG_MAIN,
|
183
|
+
GUMBO_TAG_DIV,
|
184
|
+
GUMBO_TAG_A,
|
185
|
+
GUMBO_TAG_EM,
|
186
|
+
GUMBO_TAG_STRONG,
|
187
|
+
GUMBO_TAG_SMALL,
|
188
|
+
GUMBO_TAG_S,
|
189
|
+
GUMBO_TAG_CITE,
|
190
|
+
GUMBO_TAG_Q,
|
191
|
+
GUMBO_TAG_DFN,
|
192
|
+
GUMBO_TAG_ABBR,
|
193
|
+
GUMBO_TAG_DATA,
|
194
|
+
GUMBO_TAG_TIME,
|
195
|
+
GUMBO_TAG_CODE,
|
196
|
+
GUMBO_TAG_VAR,
|
197
|
+
GUMBO_TAG_SAMP,
|
198
|
+
GUMBO_TAG_KBD,
|
199
|
+
GUMBO_TAG_SUB,
|
200
|
+
GUMBO_TAG_SUP,
|
201
|
+
GUMBO_TAG_I,
|
202
|
+
GUMBO_TAG_B,
|
203
|
+
GUMBO_TAG_U,
|
204
|
+
GUMBO_TAG_MARK,
|
205
|
+
GUMBO_TAG_RUBY,
|
206
|
+
GUMBO_TAG_RT,
|
207
|
+
GUMBO_TAG_RP,
|
208
|
+
GUMBO_TAG_BDI,
|
209
|
+
GUMBO_TAG_BDO,
|
210
|
+
GUMBO_TAG_SPAN,
|
211
|
+
GUMBO_TAG_BR,
|
212
|
+
GUMBO_TAG_WBR,
|
213
|
+
GUMBO_TAG_INS,
|
214
|
+
GUMBO_TAG_DEL,
|
215
|
+
GUMBO_TAG_IMAGE,
|
216
|
+
GUMBO_TAG_IMG,
|
217
|
+
GUMBO_TAG_IFRAME,
|
218
|
+
GUMBO_TAG_EMBED,
|
219
|
+
GUMBO_TAG_OBJECT,
|
220
|
+
GUMBO_TAG_PARAM,
|
221
|
+
GUMBO_TAG_VIDEO,
|
222
|
+
GUMBO_TAG_AUDIO,
|
223
|
+
GUMBO_TAG_SOURCE,
|
224
|
+
GUMBO_TAG_TRACK,
|
225
|
+
GUMBO_TAG_CANVAS,
|
226
|
+
GUMBO_TAG_MAP,
|
227
|
+
GUMBO_TAG_AREA,
|
228
|
+
GUMBO_TAG_MATH,
|
229
|
+
GUMBO_TAG_MI,
|
230
|
+
GUMBO_TAG_MO,
|
231
|
+
GUMBO_TAG_MN,
|
232
|
+
GUMBO_TAG_MS,
|
233
|
+
GUMBO_TAG_MTEXT,
|
234
|
+
GUMBO_TAG_MGLYPH,
|
235
|
+
GUMBO_TAG_MALIGNMARK,
|
236
|
+
GUMBO_TAG_ANNOTATION_XML,
|
237
|
+
GUMBO_TAG_SVG,
|
238
|
+
GUMBO_TAG_FOREIGNOBJECT,
|
239
|
+
GUMBO_TAG_DESC,
|
240
|
+
GUMBO_TAG_TABLE,
|
241
|
+
GUMBO_TAG_CAPTION,
|
242
|
+
GUMBO_TAG_COLGROUP,
|
243
|
+
GUMBO_TAG_COL,
|
244
|
+
GUMBO_TAG_TBODY,
|
245
|
+
GUMBO_TAG_THEAD,
|
246
|
+
GUMBO_TAG_TFOOT,
|
247
|
+
GUMBO_TAG_TR,
|
248
|
+
GUMBO_TAG_TD,
|
249
|
+
GUMBO_TAG_TH,
|
250
|
+
GUMBO_TAG_FORM,
|
251
|
+
GUMBO_TAG_FIELDSET,
|
252
|
+
GUMBO_TAG_LEGEND,
|
253
|
+
GUMBO_TAG_LABEL,
|
254
|
+
GUMBO_TAG_INPUT,
|
255
|
+
GUMBO_TAG_BUTTON,
|
256
|
+
GUMBO_TAG_SELECT,
|
257
|
+
GUMBO_TAG_DATALIST,
|
258
|
+
GUMBO_TAG_OPTGROUP,
|
259
|
+
GUMBO_TAG_OPTION,
|
260
|
+
GUMBO_TAG_TEXTAREA,
|
261
|
+
GUMBO_TAG_KEYGEN,
|
262
|
+
GUMBO_TAG_OUTPUT,
|
263
|
+
GUMBO_TAG_PROGRESS,
|
264
|
+
GUMBO_TAG_METER,
|
265
|
+
GUMBO_TAG_DETAILS,
|
266
|
+
GUMBO_TAG_SUMMARY,
|
267
|
+
GUMBO_TAG_MENU,
|
268
|
+
GUMBO_TAG_MENUITEM,
|
269
|
+
GUMBO_TAG_APPLET,
|
270
|
+
GUMBO_TAG_ACRONYM,
|
271
|
+
GUMBO_TAG_BGSOUND,
|
272
|
+
GUMBO_TAG_DIR,
|
273
|
+
GUMBO_TAG_FRAME,
|
274
|
+
GUMBO_TAG_FRAMESET,
|
275
|
+
GUMBO_TAG_NOFRAMES,
|
276
|
+
GUMBO_TAG_LISTING,
|
277
|
+
GUMBO_TAG_XMP,
|
278
|
+
GUMBO_TAG_NEXTID,
|
279
|
+
GUMBO_TAG_NOEMBED,
|
280
|
+
GUMBO_TAG_PLAINTEXT,
|
281
|
+
GUMBO_TAG_RB,
|
282
|
+
GUMBO_TAG_STRIKE,
|
283
|
+
GUMBO_TAG_BASEFONT,
|
284
|
+
GUMBO_TAG_BIG,
|
285
|
+
GUMBO_TAG_BLINK,
|
286
|
+
GUMBO_TAG_CENTER,
|
287
|
+
GUMBO_TAG_FONT,
|
288
|
+
GUMBO_TAG_MARQUEE,
|
289
|
+
GUMBO_TAG_MULTICOL,
|
290
|
+
GUMBO_TAG_NOBR,
|
291
|
+
GUMBO_TAG_SPACER,
|
292
|
+
GUMBO_TAG_TT,
|
293
|
+
GUMBO_TAG_RTC,
|
294
|
+
GUMBO_TAG_DIALOG,
|
295
|
+
GUMBO_TAG_SEARCH,
|
296
|
+
// Used for all tags that don't have special handling in HTML.
|
297
|
+
GUMBO_TAG_UNKNOWN,
|
298
|
+
// A marker value to indicate the end of the enum, for iterating over it.
|
299
|
+
GUMBO_TAG_LAST,
|
300
|
+
} GumboTag;
|
301
|
+
|
302
|
+
/**
|
303
|
+
* Returns the normalized (all lower case) tag name for a `GumboTag` enum. The
|
304
|
+
* return value is static data owned by the library.
|
305
|
+
*/
|
306
|
+
const char* gumbo_normalized_tagname(GumboTag tag);
|
307
|
+
|
308
|
+
/**
|
309
|
+
* Extracts the tag name from the `original_text` field of an element
|
310
|
+
* or token by stripping off `</>` characters and attributes and
|
311
|
+
* adjusting the passed-in `GumboStringPiece` appropriately. The tag
|
312
|
+
* name is in the original case and shares a buffer with the original
|
313
|
+
* text, to simplify memory management. Behavior is undefined if a
|
314
|
+
* string piece that doesn't represent an HTML tag (`<tagname>` or
|
315
|
+
* `</tagname>`) is passed in. If the string piece is completely
|
316
|
+
* empty (`NULL` data pointer), then this function will exit
|
317
|
+
* successfully as a no-op.
|
318
|
+
*/
|
319
|
+
void gumbo_tag_from_original_text(GumboStringPiece* text);
|
320
|
+
|
321
|
+
/**
|
322
|
+
* Fixes the case of SVG elements that are not all lowercase. This is
|
323
|
+
* not done at parse time because there's no place to store a mutated
|
324
|
+
* tag name. `tag_name` is an enum (which will be `GUMBO_TAG_UNKNOWN` for most
|
325
|
+
* SVG tags without special handling), while `original_tag_name` is a
|
326
|
+
* pointer into the original buffer. Instead, we provide this helper
|
327
|
+
* function that clients can use to rename SVG tags as appropriate.
|
328
|
+
* Returns the case-normalized SVG tagname if a replacement is found, or
|
329
|
+
* `NULL` if no normalization is called for. The return value is static
|
330
|
+
* data and owned by the library.
|
331
|
+
*
|
332
|
+
* @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
|
333
|
+
*/
|
334
|
+
const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
|
335
|
+
|
336
|
+
/**
|
337
|
+
* Converts a tag name string (which may be in upper or mixed case) to a
|
338
|
+
* tag enum.
|
339
|
+
*/
|
340
|
+
GumboTag gumbo_tagn_enum(const char* tagname, size_t length);
|
341
|
+
|
342
|
+
/**
|
343
|
+
* Attribute namespaces.
|
344
|
+
* HTML includes special handling for XLink, XML, and XMLNS namespaces
|
345
|
+
* on attributes. Everything else goes in the generic "NONE" namespace.
|
346
|
+
*/
|
347
|
+
typedef enum {
|
348
|
+
GUMBO_ATTR_NAMESPACE_NONE,
|
349
|
+
GUMBO_ATTR_NAMESPACE_XLINK,
|
350
|
+
GUMBO_ATTR_NAMESPACE_XML,
|
351
|
+
GUMBO_ATTR_NAMESPACE_XMLNS,
|
352
|
+
} GumboAttributeNamespaceEnum;
|
353
|
+
|
354
|
+
/**
|
355
|
+
* A struct representing a single attribute on an HTML tag. This is a
|
356
|
+
* name-value pair, but also includes information about source locations
|
357
|
+
* and original source text.
|
358
|
+
*/
|
359
|
+
typedef struct {
|
360
|
+
/**
|
361
|
+
* The namespace for the attribute. This will usually be
|
362
|
+
* `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes
|
363
|
+
* take special values, per:
|
364
|
+
* https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
|
365
|
+
*/
|
366
|
+
GumboAttributeNamespaceEnum attr_namespace;
|
367
|
+
|
368
|
+
/**
|
369
|
+
* The name of the attribute. This is in a freshly-allocated buffer to
|
370
|
+
* deal with case-normalization and is null-terminated.
|
371
|
+
*/
|
372
|
+
const char* name;
|
373
|
+
|
374
|
+
/**
|
375
|
+
* The original text of the attribute name, as a pointer into the
|
376
|
+
* original source buffer.
|
377
|
+
*/
|
378
|
+
GumboStringPiece original_name;
|
379
|
+
|
380
|
+
/**
|
381
|
+
* The value of the attribute. This is in a freshly-allocated buffer
|
382
|
+
* to deal with unescaping and is null-terminated. It does not include
|
383
|
+
* any quotes that surround the attribute. If the attribute has no
|
384
|
+
* value (for example, `selected` on a checkbox) this will be an empty
|
385
|
+
* string.
|
386
|
+
*/
|
387
|
+
const char* value;
|
388
|
+
|
389
|
+
/**
|
390
|
+
* The original text of the value of the attribute. This points into
|
391
|
+
* the original source buffer. It includes any quotes that surround
|
392
|
+
* the attribute and you can look at `original_value.data[0]` and
|
393
|
+
* `original_value.data[original_value.length - 1]` to determine what
|
394
|
+
* the quote characters were. If the attribute has no value this will
|
395
|
+
* be a 0-length string.
|
396
|
+
*/
|
397
|
+
GumboStringPiece original_value;
|
398
|
+
|
399
|
+
/** The starting position of the attribute name. */
|
400
|
+
GumboSourcePosition name_start;
|
401
|
+
|
402
|
+
/**
|
403
|
+
* The ending position of the attribute name. This is not always derivable
|
404
|
+
* from the starting position of the value because of the possibility of
|
405
|
+
* whitespace around the `=` sign.
|
406
|
+
*/
|
407
|
+
GumboSourcePosition name_end;
|
408
|
+
|
409
|
+
/** The starting position of the attribute value. */
|
410
|
+
GumboSourcePosition value_start;
|
411
|
+
|
412
|
+
/** The ending position of the attribute value. */
|
413
|
+
GumboSourcePosition value_end;
|
414
|
+
} GumboAttribute;
|
415
|
+
|
416
|
+
/**
|
417
|
+
* Given a vector of `GumboAttribute`s, look up the one with the
|
418
|
+
* specified name and return it, or `NULL` if no such attribute exists.
|
419
|
+
* This uses a case-insensitive match, as HTML is case-insensitive.
|
420
|
+
*/
|
421
|
+
GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
|
422
|
+
|
423
|
+
/**
|
424
|
+
* Enum denoting the type of node. This determines the type of the
|
425
|
+
* `node.v` union.
|
426
|
+
*/
|
427
|
+
typedef enum {
|
428
|
+
/** Document node. `v` will be a `GumboDocument`. */
|
429
|
+
GUMBO_NODE_DOCUMENT,
|
430
|
+
/** Element node. `v` will be a `GumboElement`. */
|
431
|
+
GUMBO_NODE_ELEMENT,
|
432
|
+
/** Text node. `v` will be a `GumboText`. */
|
433
|
+
GUMBO_NODE_TEXT,
|
434
|
+
/** CDATA node. `v` will be a `GumboText`. */
|
435
|
+
GUMBO_NODE_CDATA,
|
436
|
+
/** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */
|
437
|
+
GUMBO_NODE_COMMENT,
|
438
|
+
/** Text node whose contents are entirely whitespace. `v` will be a `GumboText`. */
|
439
|
+
GUMBO_NODE_WHITESPACE,
|
440
|
+
/**
|
441
|
+
* Template node. This is separate from `GUMBO_NODE_ELEMENT` because
|
442
|
+
* many client libraries will want to ignore the contents of template
|
443
|
+
* nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will
|
444
|
+
* do the right thing here, while clients that want to include template
|
445
|
+
* contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a
|
446
|
+
* `GumboElement`.
|
447
|
+
*/
|
448
|
+
GUMBO_NODE_TEMPLATE
|
449
|
+
} GumboNodeType;
|
450
|
+
|
451
|
+
/**
|
452
|
+
* Forward declaration of GumboNode so it can be used recursively in
|
453
|
+
* GumboNode.parent.
|
454
|
+
*/
|
455
|
+
typedef struct GumboInternalNode GumboNode;
|
456
|
+
|
457
|
+
/** https://dom.spec.whatwg.org/#concept-document-quirks */
|
458
|
+
typedef enum {
|
459
|
+
GUMBO_DOCTYPE_NO_QUIRKS,
|
460
|
+
GUMBO_DOCTYPE_QUIRKS,
|
461
|
+
GUMBO_DOCTYPE_LIMITED_QUIRKS
|
462
|
+
} GumboQuirksModeEnum;
|
463
|
+
|
464
|
+
/**
|
465
|
+
* Namespaces.
|
466
|
+
* Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
|
467
|
+
* Rather, anything inside an `<svg>` tag is in the SVG namespace,
|
468
|
+
* anything inside the `<math>` tag is in the MathML namespace, and
|
469
|
+
* anything else is inside the HTML namespace. No other namespaces are
|
470
|
+
* supported, so this can be an `enum`.
|
471
|
+
*/
|
472
|
+
typedef enum {
|
473
|
+
GUMBO_NAMESPACE_HTML,
|
474
|
+
GUMBO_NAMESPACE_SVG,
|
475
|
+
GUMBO_NAMESPACE_MATHML
|
476
|
+
} GumboNamespaceEnum;
|
477
|
+
|
478
|
+
/**
|
479
|
+
* Parse flags.
|
480
|
+
* We track the reasons for parser insertion of nodes and store them in
|
481
|
+
* a bitvector in the node itself. This lets client code optimize out
|
482
|
+
* nodes that are implied by the HTML structure of the document, or flag
|
483
|
+
* constructs that may not be allowed by a style guide, or track the
|
484
|
+
* prevalence of incorrect or tricky HTML code.
|
485
|
+
*/
|
486
|
+
typedef enum {
|
487
|
+
/**
|
488
|
+
* A normal node -- both start and end tags appear in the source,
|
489
|
+
* nothing has been reparented.
|
490
|
+
*/
|
491
|
+
GUMBO_INSERTION_NORMAL = 0,
|
492
|
+
|
493
|
+
/**
|
494
|
+
* A node inserted by the parser to fulfill some implicit insertion
|
495
|
+
* rule. This is usually set in addition to some other flag giving a
|
496
|
+
* more specific insertion reason; it's a generic catch-all term
|
497
|
+
* meaning "The start tag for this node did not appear in the document
|
498
|
+
* source".
|
499
|
+
*/
|
500
|
+
GUMBO_INSERTION_BY_PARSER = 1 << 0,
|
501
|
+
|
502
|
+
/**
|
503
|
+
* A flag indicating that the end tag for this node did not appear in
|
504
|
+
* the document source. Note that in some cases, you can still have
|
505
|
+
* parser-inserted nodes with an explicit end tag. For example,
|
506
|
+
* `Text</html>` has `GUMBO_INSERTION_BY_PARSER` set on the `<html>`
|
507
|
+
* node, but `GUMBO_INSERTION_IMPLICIT_END_TAG` is unset, as the
|
508
|
+
* `</html>` tag actually exists.
|
509
|
+
*
|
510
|
+
* This flag will be set only if the end tag is completely missing.
|
511
|
+
* In some cases, the end tag may be misplaced (e.g. a `</body>` tag
|
512
|
+
* with text afterwards), which will leave this flag unset and require
|
513
|
+
* clients to inspect the parse errors for that case.
|
514
|
+
*/
|
515
|
+
GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
|
516
|
+
|
517
|
+
// Value 1 << 2 was for a flag that has since been removed.
|
518
|
+
|
519
|
+
/**
|
520
|
+
* A flag for nodes that are inserted because their presence is
|
521
|
+
* implied by other tags, e.g. `<html>`, `<head>`, `<body>`,
|
522
|
+
* `<tbody>`, etc.
|
523
|
+
*/
|
524
|
+
GUMBO_INSERTION_IMPLIED = 1 << 3,
|
525
|
+
|
526
|
+
/**
|
527
|
+
* A flag for nodes that are converted from their end tag equivalents.
|
528
|
+
* For example, `</p>` when no paragraph is open implies that the
|
529
|
+
* parser should create a `<p>` tag and immediately close it, while
|
530
|
+
* `</br>` means the same thing as `<br>`.
|
531
|
+
*/
|
532
|
+
GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
|
533
|
+
|
534
|
+
// Value 1 << 5 was for a flag that has since been removed.
|
535
|
+
|
536
|
+
/** A flag for `<image>` tags that are rewritten as `<img>`. */
|
537
|
+
GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
|
538
|
+
|
539
|
+
/**
|
540
|
+
* A flag for nodes that are cloned as a result of the reconstruction
|
541
|
+
* of active formatting elements. This is set only on the clone; the
|
542
|
+
* initial portion of the formatting run is a NORMAL node with an
|
543
|
+
* `IMPLICIT_END_TAG`.
|
544
|
+
*/
|
545
|
+
GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
|
546
|
+
|
547
|
+
/** A flag for nodes that are cloned by the adoption agency algorithm. */
|
548
|
+
GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
|
549
|
+
|
550
|
+
/** A flag for nodes that are moved by the adoption agency algorithm. */
|
551
|
+
GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
|
552
|
+
|
553
|
+
/**
|
554
|
+
* A flag for nodes that have been foster-parented out of a table (or
|
555
|
+
* should've been foster-parented, if verbatim mode is set).
|
556
|
+
*/
|
557
|
+
GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
|
558
|
+
} GumboParseFlags;
|
559
|
+
|
560
|
+
/** Information specific to document nodes. */
|
561
|
+
typedef struct {
|
562
|
+
/**
|
563
|
+
* An array of `GumboNode`s, containing the children of this element.
|
564
|
+
* This will normally consist of the `<html>` element and any comment
|
565
|
+
* nodes found. Pointers are owned.
|
566
|
+
*/
|
567
|
+
GumboVector /* GumboNode* */ children;
|
568
|
+
|
569
|
+
/**
|
570
|
+
* `true` if there was an explicit doctype token, as opposed to it
|
571
|
+
* being omitted.
|
572
|
+
*/
|
573
|
+
bool has_doctype;
|
574
|
+
|
575
|
+
// Fields from the doctype token, copied verbatim.
|
576
|
+
const char* name;
|
577
|
+
const char* public_identifier;
|
578
|
+
const char* system_identifier;
|
579
|
+
|
580
|
+
/**
|
581
|
+
* Whether or not the document is in QuirksMode, as determined by the
|
582
|
+
* values in the GumboTokenDocType template.
|
583
|
+
*/
|
584
|
+
GumboQuirksModeEnum doc_type_quirks_mode;
|
585
|
+
} GumboDocument;
|
586
|
+
|
587
|
+
/**
|
588
|
+
* The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
|
589
|
+
* elements. This contains just a block of text and its position.
|
590
|
+
*/
|
591
|
+
typedef struct {
|
592
|
+
/**
|
593
|
+
* The text of this node, after entities have been parsed and decoded.
|
594
|
+
* For comment and cdata nodes, this does not include the comment
|
595
|
+
* delimiters.
|
596
|
+
*/
|
597
|
+
const char* text;
|
598
|
+
|
599
|
+
/**
|
600
|
+
* The original text of this node, as a pointer into the original
|
601
|
+
* buffer. For comment/cdata nodes, this includes the comment
|
602
|
+
* delimiters.
|
603
|
+
*/
|
604
|
+
GumboStringPiece original_text;
|
605
|
+
|
606
|
+
/**
|
607
|
+
* The starting position of this node. This corresponds to the
|
608
|
+
* position of `original_text`, before entities are decoded.
|
609
|
+
* */
|
610
|
+
GumboSourcePosition start_pos;
|
611
|
+
} GumboText;
|
612
|
+
|
613
|
+
/**
|
614
|
+
* The struct used to represent all HTML elements. This contains
|
615
|
+
* information about the tag, attributes, and child nodes.
|
616
|
+
*/
|
617
|
+
typedef struct {
|
618
|
+
/**
|
619
|
+
* An array of `GumboNode`s, containing the children of this element.
|
620
|
+
* Pointers are owned.
|
621
|
+
*/
|
622
|
+
GumboVector /* GumboNode* */ children;
|
623
|
+
|
624
|
+
/** The GumboTag enum for this element. */
|
625
|
+
GumboTag tag;
|
626
|
+
|
627
|
+
/** The name for this element. */
|
628
|
+
const char* name;
|
629
|
+
|
630
|
+
/** The GumboNamespaceEnum for this element. */
|
631
|
+
GumboNamespaceEnum tag_namespace;
|
632
|
+
|
633
|
+
/**
|
634
|
+
* A `GumboStringPiece` pointing to the original tag text for this
|
635
|
+
* element, pointing directly into the source buffer. If the tag was
|
636
|
+
* inserted algorithmically (for example, `<head>` or `<tbody>`
|
637
|
+
* insertion), this will be a zero-length string.
|
638
|
+
*/
|
639
|
+
GumboStringPiece original_tag;
|
640
|
+
|
641
|
+
/**
|
642
|
+
* A `GumboStringPiece` pointing to the original end tag text for this
|
643
|
+
* element. If the end tag was inserted algorithmically, (for example,
|
644
|
+
* closing a self-closing tag), this will be a zero-length string.
|
645
|
+
*/
|
646
|
+
GumboStringPiece original_end_tag;
|
647
|
+
|
648
|
+
/** The source position for the start of the start tag. */
|
649
|
+
GumboSourcePosition start_pos;
|
650
|
+
|
651
|
+
/** The source position for the start of the end tag. */
|
652
|
+
GumboSourcePosition end_pos;
|
653
|
+
|
654
|
+
/**
|
655
|
+
* An array of `GumboAttribute`s, containing the attributes for this
|
656
|
+
* tag in the order that they were parsed. Pointers are owned.
|
657
|
+
*/
|
658
|
+
GumboVector /* GumboAttribute* */ attributes;
|
659
|
+
} GumboElement;
|
660
|
+
|
661
|
+
/**
|
662
|
+
* A supertype for `GumboElement` and `GumboText`, so that we can
|
663
|
+
* include one generic type in lists of children and cast as necessary
|
664
|
+
* to subtypes.
|
665
|
+
*/
|
666
|
+
struct GumboInternalNode {
|
667
|
+
/** The type of node that this is. */
|
668
|
+
GumboNodeType type;
|
669
|
+
|
670
|
+
/** Pointer back to parent node. Not owned. */
|
671
|
+
GumboNode* parent;
|
672
|
+
|
673
|
+
/** The index within the parent's children vector of this node. */
|
674
|
+
unsigned int index_within_parent;
|
675
|
+
|
676
|
+
/**
|
677
|
+
* A bitvector of flags containing information about why this element
|
678
|
+
* was inserted into the parse tree, including a variety of special
|
679
|
+
* parse situations.
|
680
|
+
*/
|
681
|
+
GumboParseFlags parse_flags;
|
682
|
+
|
683
|
+
/** The actual node data. */
|
684
|
+
union {
|
685
|
+
GumboDocument document; // For GUMBO_NODE_DOCUMENT.
|
686
|
+
GumboElement element; // For GUMBO_NODE_ELEMENT.
|
687
|
+
GumboText text; // For everything else.
|
688
|
+
} v;
|
689
|
+
};
|
690
|
+
|
691
|
+
/**
|
692
|
+
* Input struct containing configuration options for the parser.
|
693
|
+
* These let you specify alternate memory managers, provide different
|
694
|
+
* error handling, etc. Use `kGumboDefaultOptions` for sensible
|
695
|
+
* defaults and only set what you need.
|
696
|
+
*/
|
697
|
+
typedef struct GumboInternalOptions {
|
698
|
+
/**
|
699
|
+
* The tab-stop size, for computing positions in HTML files that
|
700
|
+
* use tabs. Default: `8`.
|
701
|
+
*/
|
702
|
+
int tab_stop;
|
703
|
+
|
704
|
+
/**
|
705
|
+
* Whether or not to stop parsing when the first error is encountered.
|
706
|
+
* Default: `false`.
|
707
|
+
*/
|
708
|
+
bool stop_on_first_error;
|
709
|
+
|
710
|
+
/**
|
711
|
+
* Maximum allowed number of attributes per element. If this limit is
|
712
|
+
* exceeded, the parser will return early with a partial document and
|
713
|
+
* the returned `GumboOutput` will have its `status` field set to
|
714
|
+
* `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
|
715
|
+
* Default: `400`.
|
716
|
+
*/
|
717
|
+
int max_attributes;
|
718
|
+
|
719
|
+
/**
|
720
|
+
* Maximum allowed depth for the parse tree. If this limit is exceeded,
|
721
|
+
* the parser will return early with a partial document and the returned
|
722
|
+
* `GumboOutput` will have its `status` field set to
|
723
|
+
* `GUMBO_STATUS_TREE_TOO_DEEP`.
|
724
|
+
* Default: `400`.
|
725
|
+
*/
|
726
|
+
unsigned int max_tree_depth;
|
727
|
+
|
728
|
+
/**
|
729
|
+
* The maximum number of errors before the parser stops recording
|
730
|
+
* them. This is provided so that if the page is totally borked, we
|
731
|
+
* don't completely fill up the errors vector and exhaust memory with
|
732
|
+
* useless redundant errors. Set to `-1` to disable the limit.
|
733
|
+
* Default: `-1`.
|
734
|
+
*/
|
735
|
+
int max_errors;
|
736
|
+
|
737
|
+
/**
|
738
|
+
* The fragment context for parsing:
|
739
|
+
* https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
|
740
|
+
*
|
741
|
+
* If `NULL` is passed here, it is assumed to be "no
|
742
|
+
* fragment", i.e. the regular parsing algorithm. Otherwise, pass the
|
743
|
+
* tag name for the intended parent of the parsed fragment. We use the
|
744
|
+
* tag name, namespace, and encoding attribute which are sufficient to
|
745
|
+
* set all of the parsing context needed for fragment parsing.
|
746
|
+
*
|
747
|
+
* Default: `NULL`.
|
748
|
+
*/
|
749
|
+
const char* fragment_context;
|
750
|
+
|
751
|
+
/**
|
752
|
+
* The namespace for the fragment context. This lets client code
|
753
|
+
* differentiate between, say, parsing a `<title>` tag in SVG vs.
|
754
|
+
* parsing it in HTML.
|
755
|
+
*
|
756
|
+
* Default: `GUMBO_NAMESPACE_HTML`.
|
757
|
+
*/
|
758
|
+
GumboNamespaceEnum fragment_namespace;
|
759
|
+
|
760
|
+
/**
|
761
|
+
* The value of the fragment context's `encoding` attribute, if any.
|
762
|
+
* Set to `NULL` for no `encoding` attribute.
|
763
|
+
*
|
764
|
+
* Default: `NULL`.
|
765
|
+
*/
|
766
|
+
const char* fragment_encoding;
|
767
|
+
|
768
|
+
/**
|
769
|
+
* Quirks mode for fragment parsing. The quirks mode for a given DOCTYPE can
|
770
|
+
* be looked up using `gumbo_compute_quirks_mode()`.
|
771
|
+
*
|
772
|
+
* Default: `GUMBO_DOCTYPE_NO_QUIRKS`.
|
773
|
+
*/
|
774
|
+
GumboQuirksModeEnum quirks_mode;
|
775
|
+
|
776
|
+
/**
|
777
|
+
* For fragment parsing. Set this to true if the context node has a form
|
778
|
+
* element as an ancestor.
|
779
|
+
*
|
780
|
+
* Default: `false`.
|
781
|
+
*/
|
782
|
+
bool fragment_context_has_form_ancestor;
|
783
|
+
} GumboOptions;
|
784
|
+
|
785
|
+
/** Default options struct; use this with gumbo_parse_with_options. */
|
786
|
+
extern const GumboOptions kGumboDefaultOptions;
|
787
|
+
|
788
|
+
/**
 * Status code indicating whether parsing finished successfully or
 * was stopped mid-document due to exceptional circumstances.
 */
typedef enum {
  /**
   * Indicates that parsing completed successfully. The resulting tree
   * will be a complete document.
   */
  GUMBO_STATUS_OK,

  /**
   * Indicates that the maximum element nesting limit
   * (`GumboOptions::max_tree_depth`) was reached during parsing. The
   * resulting tree will be a partial document, with no further nodes
   * created after the point where the limit was reached. The partial
   * document may be useful for constructing an error message but
   * typically shouldn't be used for other purposes.
   */
  GUMBO_STATUS_TREE_TOO_DEEP,

  /**
   * Indicates that the maximum number of attributes per element
   * (`GumboOptions::max_attributes`) was reached during parsing. The
   * resulting tree will be a partial document, with no further nodes
   * created after the point where the limit was reached. The partial
   * document may be useful for constructing an error message but
   * typically shouldn't be used for other purposes.
   */
  GUMBO_STATUS_TOO_MANY_ATTRIBUTES,

  // Currently unused
  GUMBO_STATUS_OUT_OF_MEMORY,
} GumboOutputStatus;
|
822
|
+
|
823
|
+
|
824
|
+
/** The output struct containing the results of the parse. */
|
825
|
+
typedef struct GumboInternalOutput {
|
826
|
+
/**
|
827
|
+
* Pointer to the document node. This is a `GumboNode` of type
|
828
|
+
* `NODE_DOCUMENT` that contains the entire document as its child.
|
829
|
+
*/
|
830
|
+
GumboNode* document;
|
831
|
+
|
832
|
+
/**
|
833
|
+
* Pointer to the root node. This is the `<html>` tag that forms the
|
834
|
+
* root of the document.
|
835
|
+
*/
|
836
|
+
GumboNode* root;
|
837
|
+
|
838
|
+
/**
|
839
|
+
* A list of errors that occurred during the parse.
|
840
|
+
*/
|
841
|
+
GumboVector /* GumboError */ errors;
|
842
|
+
|
843
|
+
/**
|
844
|
+
* True if the parser encounted an error.
|
845
|
+
*
|
846
|
+
* This can be true and `errors` an empty `GumboVector` if the `max_errors`
|
847
|
+
* option was set to 0.
|
848
|
+
*/
|
849
|
+
bool document_error;
|
850
|
+
|
851
|
+
/**
|
852
|
+
* A status code indicating whether parsing finished successfully or was
|
853
|
+
* stopped mid-document due to exceptional circumstances.
|
854
|
+
*/
|
855
|
+
GumboOutputStatus status;
|
856
|
+
} GumboOutput;
|
857
|
+
|
858
|
+
/**
|
859
|
+
* Parses a buffer of UTF-8 text into a `GumboNode` parse tree. The
|
860
|
+
* buffer must live at least as long as the parse tree, as some fields
|
861
|
+
* (eg. `original_text`) point directly into the original buffer.
|
862
|
+
*
|
863
|
+
* This doesn't support buffers longer than 4 gigabytes.
|
864
|
+
*/
|
865
|
+
GumboOutput* gumbo_parse(const char* buffer);
|
866
|
+
|
867
|
+
/**
|
868
|
+
* Extended version of `gumbo_parse` that takes an explicit options
|
869
|
+
* structure, buffer, and length.
|
870
|
+
*/
|
871
|
+
GumboOutput* gumbo_parse_with_options (
|
872
|
+
const GumboOptions* options,
|
873
|
+
const char* buffer,
|
874
|
+
size_t buffer_length
|
875
|
+
);
|
876
|
+
|
877
|
+
/**
|
878
|
+
* Compute the quirks mode based on the name, public identifier, and system
|
879
|
+
* identifier. Any of these may be `NULL` to indicate a missing value.
|
880
|
+
*/
|
881
|
+
GumboQuirksModeEnum gumbo_compute_quirks_mode (
|
882
|
+
const char *name,
|
883
|
+
const char *pubid,
|
884
|
+
const char *sysid
|
885
|
+
);
|
886
|
+
|
887
|
+
/** Convert a `GumboOutputStatus` code into a readable description. */
|
888
|
+
const char* gumbo_status_to_string(GumboOutputStatus status);
|
889
|
+
|
890
|
+
/** Release the memory used for the parse tree and parse errors. */
|
891
|
+
void gumbo_destroy_output(GumboOutput* output);
|
892
|
+
|
893
|
+
/** Opaque GumboError type */
|
894
|
+
typedef struct GumboInternalError GumboError;
|
895
|
+
|
896
|
+
/**
|
897
|
+
* Returns the position of the error.
|
898
|
+
*/
|
899
|
+
GumboSourcePosition gumbo_error_position(const GumboError* error);
|
900
|
+
|
901
|
+
/**
|
902
|
+
* Returns a constant string representation of the error's code. This is owned
|
903
|
+
* by the library and should not be freed by the caller.
|
904
|
+
*/
|
905
|
+
const char* gumbo_error_code(const GumboError* error);
|
906
|
+
|
907
|
+
/**
|
908
|
+
* Prints an error to a string. This stores a freshly-allocated buffer
|
909
|
+
* containing the error message text in output. The caller is responsible for
|
910
|
+
* freeing the buffer. The size of the error message is returned. The error
|
911
|
+
* message itself may not be NUL-terminated and may contain NUL bytes so the
|
912
|
+
* returned size must be used.
|
913
|
+
*/
|
914
|
+
size_t gumbo_error_to_string(const GumboError* error, char **output);
|
915
|
+
|
916
|
+
/**
|
917
|
+
* Prints a caret diagnostic to a string. This stores a freshly-allocated
|
918
|
+
* buffer containing the error message text in output. The caller is responsible for
|
919
|
+
* freeing the buffer. The size of the error message is returned. The error
|
920
|
+
* message itself may not be NUL-terminated and may contain NUL bytes so the
|
921
|
+
* returned size must be used.
|
922
|
+
*/
|
923
|
+
size_t gumbo_caret_diagnostic_to_string (
|
924
|
+
const GumboError* error,
|
925
|
+
const char* source_text,
|
926
|
+
size_t source_length,
|
927
|
+
char** output
|
928
|
+
);
|
929
|
+
|
930
|
+
/**
|
931
|
+
* Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
|
932
|
+
* instead of writing to a string.
|
933
|
+
*/
|
934
|
+
void gumbo_print_caret_diagnostic (
|
935
|
+
const GumboError* error,
|
936
|
+
const char* source_text,
|
937
|
+
size_t source_length
|
938
|
+
);
|
939
|
+
|
940
|
+
#ifdef __cplusplus
|
941
|
+
}
|
942
|
+
#endif
|
943
|
+
|
944
|
+
#endif // GUMBO_H
|