nokogumbo 0.4 → 0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +0 -3
- data/Rakefile +10 -9
- metadata +3 -29
- data/work/attribute.c +0 -44
- data/work/attribute.h +0 -37
- data/work/char_ref.c +0 -2561
- data/work/char_ref.h +0 -61
- data/work/error.c +0 -258
- data/work/error.h +0 -225
- data/work/gumbo.h +0 -800
- data/work/insertion_mode.h +0 -54
- data/work/nokogumbo.c +0 -254
- data/work/parser.c +0 -3893
- data/work/parser.h +0 -57
- data/work/string_buffer.c +0 -106
- data/work/string_buffer.h +0 -82
- data/work/string_piece.c +0 -49
- data/work/string_piece.h +0 -39
- data/work/tag.c +0 -222
- data/work/token_type.h +0 -40
- data/work/tokenizer.c +0 -2978
- data/work/tokenizer.h +0 -123
- data/work/tokenizer_states.h +0 -103
- data/work/utf8.c +0 -268
- data/work/utf8.h +0 -127
- data/work/util.c +0 -58
- data/work/util.h +0 -57
- data/work/vector.c +0 -121
- data/work/vector.h +0 -66
data/work/gumbo.h
DELETED
@@ -1,800 +0,0 @@
|
|
1
|
-
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
//
|
17
|
-
// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
|
18
|
-
// GUMBO_ as a prefix for enum constants (static constants get the Google-style
|
19
|
-
// kGumbo prefix).
|
20
|
-
|
21
|
-
/**
|
22
|
-
* @file
|
23
|
-
* @mainpage Gumbo HTML Parser
|
24
|
-
*
|
25
|
-
* This provides a conformant, no-dependencies implementation of the HTML5
|
26
|
-
* parsing algorithm. It supports only UTF8; if you need to parse a different
|
27
|
-
* encoding, run a preprocessing step to convert to UTF8. It returns a parse
|
28
|
-
* tree made of the structs in this file.
|
29
|
-
*
|
30
|
-
* Example:
|
31
|
-
* @code
|
32
|
-
* GumboOutput* output = gumbo_parse(input);
|
33
|
-
* do_something_with_doctype(output->document);
|
34
|
-
* do_something_with_html_tree(output->root);
|
35
|
-
* gumbo_destroy_output(&kGumboDefaultOptions, output);
|
36
|
-
* @endcode
|
37
|
-
* HTML5 Spec:
|
38
|
-
*
|
39
|
-
* http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
|
40
|
-
*/
|
41
|
-
|
42
|
-
#ifndef GUMBO_GUMBO_H_
|
43
|
-
#define GUMBO_GUMBO_H_
|
44
|
-
|
45
|
-
#include <stdbool.h>
|
46
|
-
#include <stddef.h>
|
47
|
-
|
48
|
-
#ifdef __cplusplus
|
49
|
-
extern "C" {
|
50
|
-
#endif
|
51
|
-
|
52
|
-
/**
|
53
|
-
* A struct representing a character position within the original text buffer.
|
54
|
-
* Line and column numbers are 1-based and offsets are 0-based, which matches
|
55
|
-
* how most editors and command-line tools work. Also, columns measure
|
56
|
-
* positions in terms of characters while offsets measure by bytes; this is
|
57
|
-
* because the offset field is often used to pull out a particular region of
|
58
|
-
* text (which in most languages that bind to C implies pointer arithmetic on a
|
59
|
-
* buffer of bytes), while the column field is often used to reference a
|
60
|
-
* particular column on a printable display, which nowadays is usually UTF-8.
|
61
|
-
*/
|
62
|
-
typedef struct _GumboSourcePosition {
|
63
|
-
unsigned int line;
|
64
|
-
unsigned int column;
|
65
|
-
unsigned int offset;
|
66
|
-
} GumboSourcePosition;
|
67
|
-
|
68
|
-
/**
|
69
|
-
* A SourcePosition used for elements that have no source position, i.e.
|
70
|
-
* parser-inserted elements.
|
71
|
-
*/
|
72
|
-
extern const GumboSourcePosition kGumboEmptySourcePosition;
|
73
|
-
|
74
|
-
|
75
|
-
/**
|
76
|
-
* A struct representing a string or part of a string. Strings within the
|
77
|
-
* parser are represented by a char* and a length; the char* points into
|
78
|
-
* an existing data buffer owned by some other code (often the original input).
|
79
|
-
* GumboStringPieces are assumed (by convention) to be immutable, because they
|
80
|
-
* may share data. Use GumboStringBuffer if you need to construct a string.
|
81
|
-
* Clients should assume that it is not NUL-terminated, and should always use
|
82
|
-
* explicit lengths when manipulating them.
|
83
|
-
*/
|
84
|
-
typedef struct _GumboStringPiece {
|
85
|
-
/** A pointer to the beginning of the string. NULL iff length == 0. */
|
86
|
-
const char* data;
|
87
|
-
|
88
|
-
/** The length of the string fragment, in bytes. May be zero. */
|
89
|
-
size_t length;
|
90
|
-
} GumboStringPiece;
|
91
|
-
|
92
|
-
/** A constant to represent a 0-length null string. */
|
93
|
-
extern const GumboStringPiece kGumboEmptyString;
|
94
|
-
|
95
|
-
/**
|
96
|
-
* Compares two GumboStringPieces, and returns true if they're equal or false
|
97
|
-
* otherwise.
|
98
|
-
*/
|
99
|
-
bool gumbo_string_equals(
|
100
|
-
const GumboStringPiece* str1, const GumboStringPiece* str2);
|
101
|
-
|
102
|
-
/**
|
103
|
-
* Compares two GumboStringPieces ignoring case, and returns true if they're
|
104
|
-
* equal or false otherwise.
|
105
|
-
*/
|
106
|
-
bool gumbo_string_equals_ignore_case(
|
107
|
-
const GumboStringPiece* str1, const GumboStringPiece* str2);
|
108
|
-
|
109
|
-
|
110
|
-
/**
|
111
|
-
* A simple vector implementation. This stores a pointer to a data array and a
|
112
|
-
* length. All elements are stored as void*; client code must cast to the
|
113
|
-
* appropriate type. Overflows upon addition result in reallocation of the data
|
114
|
-
* array, with the size doubling to maintain O(1) amortized cost. There is no
|
115
|
-
* removal function, as this isn't needed for any of the operations within this
|
116
|
-
* library. Iteration can be done through inspecting the structure directly in
|
117
|
-
* a for-loop.
|
118
|
-
*/
|
119
|
-
typedef struct _GumboVector {
|
120
|
-
/** Data elements. This points to a dynamically-allocated array of capacity
|
121
|
-
* elements, each a void* to the element itself.
|
122
|
-
*/
|
123
|
-
void** data;
|
124
|
-
|
125
|
-
/** Number of elements currently in the vector. */
|
126
|
-
unsigned int length;
|
127
|
-
|
128
|
-
/** Current array capacity. */
|
129
|
-
unsigned int capacity;
|
130
|
-
} GumboVector;
|
131
|
-
|
132
|
-
/** An empty (0-length, 0-capacity) GumboVector. */
|
133
|
-
extern const GumboVector kGumboEmptyVector;
|
134
|
-
|
135
|
-
/**
|
136
|
-
* Returns the first index at which an element appears in this vector (testing
|
137
|
-
* by pointer equality), or -1 if it never does.
|
138
|
-
*/
|
139
|
-
int gumbo_vector_index_of(GumboVector* vector, void* element);
|
140
|
-
|
141
|
-
|
142
|
-
/**
|
143
|
-
* An enum for all the tags defined in the HTML5 standard. These correspond to
|
144
|
-
* the tag names themselves. Enum constants exist only for tags which appear in
|
145
|
-
* the spec itself (or for tags with special handling in the SVG and MathML
|
146
|
-
* namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
|
147
|
-
* name can be obtained through original_tag.
|
148
|
-
*
|
149
|
-
* This is mostly for API convenience, so that clients of this library don't
|
150
|
-
* need to perform a strcasecmp to find the normalized tag name. It also has
|
151
|
-
* efficiency benefits, by letting the parser work with enums instead of
|
152
|
-
* strings.
|
153
|
-
*/
|
154
|
-
typedef enum _GumboTag {
|
155
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
|
156
|
-
GUMBO_TAG_HTML,
|
157
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
|
158
|
-
GUMBO_TAG_HEAD,
|
159
|
-
GUMBO_TAG_TITLE,
|
160
|
-
GUMBO_TAG_BASE,
|
161
|
-
GUMBO_TAG_LINK,
|
162
|
-
GUMBO_TAG_META,
|
163
|
-
GUMBO_TAG_STYLE,
|
164
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
|
165
|
-
GUMBO_TAG_SCRIPT,
|
166
|
-
GUMBO_TAG_NOSCRIPT,
|
167
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
|
168
|
-
GUMBO_TAG_BODY,
|
169
|
-
GUMBO_TAG_SECTION,
|
170
|
-
GUMBO_TAG_NAV,
|
171
|
-
GUMBO_TAG_ARTICLE,
|
172
|
-
GUMBO_TAG_ASIDE,
|
173
|
-
GUMBO_TAG_H1,
|
174
|
-
GUMBO_TAG_H2,
|
175
|
-
GUMBO_TAG_H3,
|
176
|
-
GUMBO_TAG_H4,
|
177
|
-
GUMBO_TAG_H5,
|
178
|
-
GUMBO_TAG_H6,
|
179
|
-
GUMBO_TAG_HGROUP,
|
180
|
-
GUMBO_TAG_HEADER,
|
181
|
-
GUMBO_TAG_FOOTER,
|
182
|
-
GUMBO_TAG_ADDRESS,
|
183
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
|
184
|
-
GUMBO_TAG_P,
|
185
|
-
GUMBO_TAG_HR,
|
186
|
-
GUMBO_TAG_PRE,
|
187
|
-
GUMBO_TAG_BLOCKQUOTE,
|
188
|
-
GUMBO_TAG_OL,
|
189
|
-
GUMBO_TAG_UL,
|
190
|
-
GUMBO_TAG_LI,
|
191
|
-
GUMBO_TAG_DL,
|
192
|
-
GUMBO_TAG_DT,
|
193
|
-
GUMBO_TAG_DD,
|
194
|
-
GUMBO_TAG_FIGURE,
|
195
|
-
GUMBO_TAG_FIGCAPTION,
|
196
|
-
GUMBO_TAG_DIV,
|
197
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
|
198
|
-
GUMBO_TAG_A,
|
199
|
-
GUMBO_TAG_EM,
|
200
|
-
GUMBO_TAG_STRONG,
|
201
|
-
GUMBO_TAG_SMALL,
|
202
|
-
GUMBO_TAG_S,
|
203
|
-
GUMBO_TAG_CITE,
|
204
|
-
GUMBO_TAG_Q,
|
205
|
-
GUMBO_TAG_DFN,
|
206
|
-
GUMBO_TAG_ABBR,
|
207
|
-
GUMBO_TAG_TIME,
|
208
|
-
GUMBO_TAG_CODE,
|
209
|
-
GUMBO_TAG_VAR,
|
210
|
-
GUMBO_TAG_SAMP,
|
211
|
-
GUMBO_TAG_KBD,
|
212
|
-
GUMBO_TAG_SUB,
|
213
|
-
GUMBO_TAG_SUP,
|
214
|
-
GUMBO_TAG_I,
|
215
|
-
GUMBO_TAG_B,
|
216
|
-
GUMBO_TAG_MARK,
|
217
|
-
GUMBO_TAG_RUBY,
|
218
|
-
GUMBO_TAG_RT,
|
219
|
-
GUMBO_TAG_RP,
|
220
|
-
GUMBO_TAG_BDI,
|
221
|
-
GUMBO_TAG_BDO,
|
222
|
-
GUMBO_TAG_SPAN,
|
223
|
-
GUMBO_TAG_BR,
|
224
|
-
GUMBO_TAG_WBR,
|
225
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
|
226
|
-
GUMBO_TAG_INS,
|
227
|
-
GUMBO_TAG_DEL,
|
228
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
|
229
|
-
GUMBO_TAG_IMAGE,
|
230
|
-
GUMBO_TAG_IMG,
|
231
|
-
GUMBO_TAG_IFRAME,
|
232
|
-
GUMBO_TAG_EMBED,
|
233
|
-
GUMBO_TAG_OBJECT,
|
234
|
-
GUMBO_TAG_PARAM,
|
235
|
-
GUMBO_TAG_VIDEO,
|
236
|
-
GUMBO_TAG_AUDIO,
|
237
|
-
GUMBO_TAG_SOURCE,
|
238
|
-
GUMBO_TAG_TRACK,
|
239
|
-
GUMBO_TAG_CANVAS,
|
240
|
-
GUMBO_TAG_MAP,
|
241
|
-
GUMBO_TAG_AREA,
|
242
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
|
243
|
-
GUMBO_TAG_MATH,
|
244
|
-
GUMBO_TAG_MI,
|
245
|
-
GUMBO_TAG_MO,
|
246
|
-
GUMBO_TAG_MN,
|
247
|
-
GUMBO_TAG_MS,
|
248
|
-
GUMBO_TAG_MTEXT,
|
249
|
-
GUMBO_TAG_MGLYPH,
|
250
|
-
GUMBO_TAG_MALIGNMARK,
|
251
|
-
GUMBO_TAG_ANNOTATION_XML,
|
252
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
|
253
|
-
GUMBO_TAG_SVG,
|
254
|
-
GUMBO_TAG_FOREIGNOBJECT,
|
255
|
-
GUMBO_TAG_DESC,
|
256
|
-
// SVG title tags will have GUMBO_TAG_TITLE as with HTML.
|
257
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
|
258
|
-
GUMBO_TAG_TABLE,
|
259
|
-
GUMBO_TAG_CAPTION,
|
260
|
-
GUMBO_TAG_COLGROUP,
|
261
|
-
GUMBO_TAG_COL,
|
262
|
-
GUMBO_TAG_TBODY,
|
263
|
-
GUMBO_TAG_THEAD,
|
264
|
-
GUMBO_TAG_TFOOT,
|
265
|
-
GUMBO_TAG_TR,
|
266
|
-
GUMBO_TAG_TD,
|
267
|
-
GUMBO_TAG_TH,
|
268
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
|
269
|
-
GUMBO_TAG_FORM,
|
270
|
-
GUMBO_TAG_FIELDSET,
|
271
|
-
GUMBO_TAG_LEGEND,
|
272
|
-
GUMBO_TAG_LABEL,
|
273
|
-
GUMBO_TAG_INPUT,
|
274
|
-
GUMBO_TAG_BUTTON,
|
275
|
-
GUMBO_TAG_SELECT,
|
276
|
-
GUMBO_TAG_DATALIST,
|
277
|
-
GUMBO_TAG_OPTGROUP,
|
278
|
-
GUMBO_TAG_OPTION,
|
279
|
-
GUMBO_TAG_TEXTAREA,
|
280
|
-
GUMBO_TAG_KEYGEN,
|
281
|
-
GUMBO_TAG_OUTPUT,
|
282
|
-
GUMBO_TAG_PROGRESS,
|
283
|
-
GUMBO_TAG_METER,
|
284
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
|
285
|
-
GUMBO_TAG_DETAILS,
|
286
|
-
GUMBO_TAG_SUMMARY,
|
287
|
-
GUMBO_TAG_COMMAND,
|
288
|
-
GUMBO_TAG_MENU,
|
289
|
-
// Non-conforming elements that nonetheless appear in the HTML5 spec.
|
290
|
-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
|
291
|
-
GUMBO_TAG_APPLET,
|
292
|
-
GUMBO_TAG_ACRONYM,
|
293
|
-
GUMBO_TAG_BGSOUND,
|
294
|
-
GUMBO_TAG_DIR,
|
295
|
-
GUMBO_TAG_FRAME,
|
296
|
-
GUMBO_TAG_FRAMESET,
|
297
|
-
GUMBO_TAG_NOFRAMES,
|
298
|
-
GUMBO_TAG_ISINDEX,
|
299
|
-
GUMBO_TAG_LISTING,
|
300
|
-
GUMBO_TAG_XMP,
|
301
|
-
GUMBO_TAG_NEXTID,
|
302
|
-
GUMBO_TAG_NOEMBED,
|
303
|
-
GUMBO_TAG_PLAINTEXT,
|
304
|
-
GUMBO_TAG_RB,
|
305
|
-
GUMBO_TAG_STRIKE,
|
306
|
-
GUMBO_TAG_BASEFONT,
|
307
|
-
GUMBO_TAG_BIG,
|
308
|
-
GUMBO_TAG_BLINK,
|
309
|
-
GUMBO_TAG_CENTER,
|
310
|
-
GUMBO_TAG_FONT,
|
311
|
-
GUMBO_TAG_MARQUEE,
|
312
|
-
GUMBO_TAG_MULTICOL,
|
313
|
-
GUMBO_TAG_NOBR,
|
314
|
-
GUMBO_TAG_SPACER,
|
315
|
-
GUMBO_TAG_TT,
|
316
|
-
GUMBO_TAG_U,
|
317
|
-
// Used for all tags that don't have special handling in HTML.
|
318
|
-
GUMBO_TAG_UNKNOWN,
|
319
|
-
// A marker value to indicate the end of the enum, for iterating over it.
|
320
|
-
// Also used as the terminator for varargs functions that take tags.
|
321
|
-
GUMBO_TAG_LAST,
|
322
|
-
} GumboTag;
|
323
|
-
|
324
|
-
/**
|
325
|
-
* Returns the normalized (usually all-lowercased, except for foreign content)
|
326
|
-
* tag name for a GumboTag enum. Return value is static data owned by the
|
327
|
-
* library.
|
328
|
-
*/
|
329
|
-
const char* gumbo_normalized_tagname(GumboTag tag);
|
330
|
-
|
331
|
-
/**
|
332
|
-
* Extracts the tag name from the original_text field of an element or token by
|
333
|
-
* stripping off </> characters and attributes and adjusting the passed-in
|
334
|
-
* GumboStringPiece appropriately. The tag name is in the original case and
|
335
|
-
* shares a buffer with the original text, to simplify memory management.
|
336
|
-
* Behavior is undefined if a string-piece that doesn't represent an HTML tag
|
337
|
-
* (<tagname> or </tagname>) is passed in. If the string piece is completely
|
338
|
-
* empty (NULL data pointer), then this function will exit successfully as a
|
339
|
-
* no-op.
|
340
|
-
*/
|
341
|
-
void gumbo_tag_from_original_text(GumboStringPiece* text);
|
342
|
-
|
343
|
-
/**
|
344
|
-
* Fixes the case of SVG elements that are not all lowercase.
|
345
|
-
* http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
|
346
|
-
* This is not done at parse time because there's no place to store a mutated
|
347
|
-
* tag name. tag is an enum (which will be GUMBO_TAG_UNKNOWN for most SVG tags
|
348
|
-
* without special handling), while original_tag is a pointer into the
|
349
|
-
* original buffer. Instead, we provide this helper function that clients can
|
350
|
-
* use to rename SVG tags as appropriate.
|
351
|
-
* Returns the case-normalized SVG tagname if a replacement is found, or NULL if
|
352
|
-
* no normalization is called for. The return value is static data and owned by
|
353
|
-
* the library.
|
354
|
-
*/
|
355
|
-
const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
|
356
|
-
|
357
|
-
/**
|
358
|
-
* Converts a tag name string (which may be in upper or mixed case) to a tag
|
359
|
-
* enum.
|
360
|
-
*/
|
361
|
-
GumboTag gumbo_tag_enum(const char* tagname);
|
362
|
-
|
363
|
-
/**
|
364
|
-
* Attribute namespaces.
|
365
|
-
* HTML includes special handling for XLink, XML, and XMLNS namespaces on
|
366
|
-
* attributes. Everything else goes in the generic "NONE" namespace.
|
367
|
-
*/
|
368
|
-
typedef enum _GumboAttributeNamespaceEnum {
|
369
|
-
GUMBO_ATTR_NAMESPACE_NONE,
|
370
|
-
GUMBO_ATTR_NAMESPACE_XLINK,
|
371
|
-
GUMBO_ATTR_NAMESPACE_XML,
|
372
|
-
GUMBO_ATTR_NAMESPACE_XMLNS,
|
373
|
-
} GumboAttributeNamespaceEnum;
|
374
|
-
|
375
|
-
/**
|
376
|
-
* A struct representing a single attribute on an HTML tag. This is a
|
377
|
-
* name-value pair, but also includes information about source locations and
|
378
|
-
* original source text.
|
379
|
-
*/
|
380
|
-
typedef struct _GumboAttribute {
|
381
|
-
/**
|
382
|
-
* The namespace for the attribute. This will usually be
|
383
|
-
* GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
|
384
|
-
* values, per:
|
385
|
-
* http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
|
386
|
-
*/
|
387
|
-
GumboAttributeNamespaceEnum attr_namespace;
|
388
|
-
|
389
|
-
/**
|
390
|
-
* The name of the attribute. This is in a freshly-allocated buffer to deal
|
391
|
-
* with case-normalization, and is null-terminated.
|
392
|
-
*/
|
393
|
-
const char* name;
|
394
|
-
|
395
|
-
/**
|
396
|
-
* The original text of the attribute name, as a pointer into the original
|
397
|
-
* source buffer.
|
398
|
-
*/
|
399
|
-
GumboStringPiece original_name;
|
400
|
-
|
401
|
-
/**
|
402
|
-
* The value of the attribute. This is in a freshly-allocated buffer to deal
|
403
|
-
* with unescaping, and is null-terminated. It does not include any quotes
|
404
|
-
* that surround the attribute. If the attribute has no value (for example,
|
405
|
-
* 'selected' on a checkbox), this will be an empty string.
|
406
|
-
*/
|
407
|
-
const char* value;
|
408
|
-
|
409
|
-
/**
|
410
|
-
* The original text of the value of the attribute. This points into the
|
411
|
-
* original source buffer. It includes any quotes that surround the
|
412
|
-
* attribute, and you can look at original_value.data[0] and
|
413
|
-
* original_value.data[original_value.length - 1] to determine what the quote
|
414
|
-
* characters were. If the attribute has no value, this will be a 0-length
|
415
|
-
* string.
|
416
|
-
*/
|
417
|
-
GumboStringPiece original_value;
|
418
|
-
|
419
|
-
/** The starting position of the attribute name. */
|
420
|
-
GumboSourcePosition name_start;
|
421
|
-
|
422
|
-
/**
|
423
|
-
* The ending position of the attribute name. This is not always derivable
|
424
|
-
* from the starting position of the value because of the possibility of
|
425
|
-
* whitespace around the = sign.
|
426
|
-
*/
|
427
|
-
GumboSourcePosition name_end;
|
428
|
-
|
429
|
-
/** The starting position of the attribute value. */
|
430
|
-
GumboSourcePosition value_start;
|
431
|
-
|
432
|
-
/** The ending position of the attribute value. */
|
433
|
-
GumboSourcePosition value_end;
|
434
|
-
} GumboAttribute;
|
435
|
-
|
436
|
-
/**
|
437
|
-
* Given a vector of GumboAttributes, look up the one with the specified name
|
438
|
-
* and return it, or NULL if no such attribute exists. This uses a
|
439
|
-
* case-insensitive match, as HTML is case-insensitive.
|
440
|
-
*/
|
441
|
-
GumboAttribute* gumbo_get_attribute(
|
442
|
-
const struct _GumboVector* attrs, const char* name);
|
443
|
-
|
444
|
-
/**
|
445
|
-
* Enum denoting the type of node. This determines the type of the node.v
|
446
|
-
* union.
|
447
|
-
*/
|
448
|
-
typedef enum _GumboNodeType {
|
449
|
-
/** Document node. v will be a GumboDocument. */
|
450
|
-
GUMBO_NODE_DOCUMENT,
|
451
|
-
/** Element node. v will be a GumboElement. */
|
452
|
-
GUMBO_NODE_ELEMENT,
|
453
|
-
/** Text node. v will be a GumboText. */
|
454
|
-
GUMBO_NODE_TEXT,
|
455
|
-
/** CDATA node. v will be a GumboText. */
|
456
|
-
GUMBO_NODE_CDATA,
|
457
|
-
/** Comment node. v will be a GumboText, excluding comment delimiters. */
|
458
|
-
GUMBO_NODE_COMMENT,
|
459
|
-
/** Text node, where all content is whitespace. v will be a GumboText. */
|
460
|
-
GUMBO_NODE_WHITESPACE
|
461
|
-
} GumboNodeType;
|
462
|
-
|
463
|
-
/**
|
464
|
-
* Forward declaration of GumboNode so it can be used recursively in
|
465
|
-
* GumboNode.parent.
|
466
|
-
*/
|
467
|
-
typedef struct _GumboNode GumboNode;
|
468
|
-
|
469
|
-
/** http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode */
|
470
|
-
typedef enum _GumboQuirksModeEnum {
|
471
|
-
GUMBO_DOCTYPE_NO_QUIRKS,
|
472
|
-
GUMBO_DOCTYPE_QUIRKS,
|
473
|
-
GUMBO_DOCTYPE_LIMITED_QUIRKS
|
474
|
-
} GumboQuirksModeEnum;
|
475
|
-
|
476
|
-
/**
|
477
|
-
* Namespaces.
|
478
|
-
* Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather,
|
479
|
-
* anything inside an <svg> tag is in the SVG namespace, anything inside the
|
480
|
-
* <math> tag is in the MathML namespace, and anything else is inside the HTML
|
481
|
-
* namespace. No other namespaces are supported, so this can be an enum only.
|
482
|
-
*/
|
483
|
-
typedef enum _GumboNamespaceEnum {
|
484
|
-
GUMBO_NAMESPACE_HTML,
|
485
|
-
GUMBO_NAMESPACE_SVG,
|
486
|
-
GUMBO_NAMESPACE_MATHML
|
487
|
-
} GumboNamespaceEnum;
|
488
|
-
|
489
|
-
/**
|
490
|
-
* Parse flags.
|
491
|
-
* We track the reasons for parser insertion of nodes and store them in a
|
492
|
-
* bitvector in the node itself. This lets client code optimize out nodes that
|
493
|
-
* are implied by the HTML structure of the document, or flag constructs that
|
494
|
-
* may not be allowed by a style guide, or track the prevalence of incorrect or
|
495
|
-
* tricky HTML code.
|
496
|
-
*/
|
497
|
-
typedef enum _GumboParseFlags {
|
498
|
-
/**
|
499
|
-
* A normal node - both start and end tags appear in the source, nothing has
|
500
|
-
* been reparented.
|
501
|
-
*/
|
502
|
-
GUMBO_INSERTION_NORMAL = 0,
|
503
|
-
|
504
|
-
/**
|
505
|
-
* A node inserted by the parser to fulfill some implicit insertion rule.
|
506
|
-
* This is usually set in addition to some other flag giving a more specific
|
507
|
-
* insertion reason; it's a generic catch-all term meaning "The start tag for
|
508
|
-
* this node did not appear in the document source".
|
509
|
-
*/
|
510
|
-
GUMBO_INSERTION_BY_PARSER = 1 << 0,
|
511
|
-
|
512
|
-
/**
|
513
|
-
* A flag indicating that the end tag for this node did not appear in the
|
514
|
-
* document source. Note that in some cases, you can still have
|
515
|
-
* parser-inserted nodes with an explicit end tag: for example, "Text</html>"
|
516
|
-
* has GUMBO_INSERTION_BY_PARSER set on the <html> node, but
|
517
|
-
* GUMBO_INSERTION_IMPLICIT_END_TAG is unset, as the </html> tag actually
|
518
|
-
* exists. This flag will be set only if the end tag is completely missing;
|
519
|
-
* in some cases, the end tag may be misplaced (eg. a </body> tag with text
|
520
|
-
* afterwards), which will leave this flag unset and require clients to
|
521
|
-
* inspect the parse errors for that case.
|
522
|
-
*/
|
523
|
-
GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
|
524
|
-
|
525
|
-
// Value 1 << 2 was for a flag that has since been removed.
|
526
|
-
|
527
|
-
/**
|
528
|
-
* A flag for nodes that are inserted because their presence is implied by
|
529
|
-
* other tags, eg. <html>, <head>, <body>, <tbody>, etc.
|
530
|
-
*/
|
531
|
-
GUMBO_INSERTION_IMPLIED = 1 << 3,
|
532
|
-
|
533
|
-
/**
|
534
|
-
* A flag for nodes that are converted from their end tag equivalents. For
|
535
|
-
* example, </p> when no paragraph is open implies that the parser should
|
536
|
-
* create a <p> tag and immediately close it, while </br> means the same thing
|
537
|
-
* as <br>.
|
538
|
-
*/
|
539
|
-
GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
|
540
|
-
|
541
|
-
/** A flag for nodes that are converted from the parse of an <isindex> tag. */
|
542
|
-
GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
|
543
|
-
|
544
|
-
/** A flag for <image> tags that are rewritten as <img>. */
|
545
|
-
GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
|
546
|
-
|
547
|
-
/**
|
548
|
-
* A flag for nodes that are cloned as a result of the reconstruction of
|
549
|
-
* active formatting elements. This is set only on the clone; the initial
|
550
|
-
* portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
|
551
|
-
*/
|
552
|
-
GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
|
553
|
-
|
554
|
-
/** A flag for nodes that are cloned by the adoption agency algorithm. */
|
555
|
-
GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
|
556
|
-
|
557
|
-
/** A flag for nodes that are moved by the adoption agency algorithm. */
|
558
|
-
GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
|
559
|
-
|
560
|
-
/**
|
561
|
-
* A flag for nodes that have been foster-parented out of a table (or
|
562
|
-
* should've been foster-parented, if verbatim mode is set).
|
563
|
-
*/
|
564
|
-
GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
|
565
|
-
} GumboParseFlags;
|
566
|
-
|
567
|
-
|
568
|
-
/**
|
569
|
-
* Information specific to document nodes.
|
570
|
-
*/
|
571
|
-
typedef struct _GumboDocument {
|
572
|
-
/**
|
573
|
-
* An array of GumboNodes, containing the children of this element. This will
|
574
|
-
* normally consist of the <html> element and any comment nodes found.
|
575
|
-
* Pointers are owned.
|
576
|
-
*/
|
577
|
-
GumboVector /* GumboNode* */ children;
|
578
|
-
|
579
|
-
// True if there was an explicit doctype token as opposed to it being omitted.
|
580
|
-
bool has_doctype;
|
581
|
-
|
582
|
-
// Fields from the doctype token, copied verbatim.
|
583
|
-
const char* name;
|
584
|
-
const char* public_identifier;
|
585
|
-
const char* system_identifier;
|
586
|
-
|
587
|
-
/**
|
588
|
-
* Whether or not the document is in QuirksMode, as determined by the values
|
589
|
-
* in the GumboTokenDocType template.
|
590
|
-
*/
|
591
|
-
GumboQuirksModeEnum doc_type_quirks_mode;
|
592
|
-
} GumboDocument;
|
593
|
-
|
594
|
-
/**
|
595
|
-
* The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
|
596
|
-
* This contains just a block of text and its position.
|
597
|
-
*/
|
598
|
-
typedef struct _GumboText {
|
599
|
-
/**
|
600
|
-
* The text of this node, after entities have been parsed and decoded. For
|
601
|
-
* comment/cdata nodes, this does not include the comment delimiters.
|
602
|
-
*/
|
603
|
-
const char* text;
|
604
|
-
|
605
|
-
/**
|
606
|
-
* The original text of this node, as a pointer into the original buffer. For
|
607
|
-
* comment/cdata nodes, this includes the comment delimiters.
|
608
|
-
*/
|
609
|
-
GumboStringPiece original_text;
|
610
|
-
|
611
|
-
/**
|
612
|
-
* The starting position of this node. This corresponds to the position of
|
613
|
-
* original_text, before entities are decoded.
|
614
|
-
*/
|
615
|
-
GumboSourcePosition start_pos;
|
616
|
-
} GumboText;
|
617
|
-
|
618
|
-
/**
|
619
|
-
* The struct used to represent all HTML elements. This contains information
|
620
|
-
* about the tag, attributes, and child nodes.
|
621
|
-
*/
|
622
|
-
typedef struct _GumboElement {
|
623
|
-
/**
|
624
|
-
* An array of GumboNodes, containing the children of this element. Pointers
|
625
|
-
* are owned.
|
626
|
-
*/
|
627
|
-
GumboVector /* GumboNode* */ children;
|
628
|
-
|
629
|
-
/** The GumboTag enum for this element. */
|
630
|
-
GumboTag tag;
|
631
|
-
|
632
|
-
/** The GumboNamespaceEnum for this element. */
|
633
|
-
GumboNamespaceEnum tag_namespace;
|
634
|
-
|
635
|
-
/**
|
636
|
-
* A GumboStringPiece pointing to the original tag text for this element,
|
637
|
-
* pointing directly into the source buffer. If the tag was inserted
|
638
|
-
* algorithmically (for example, <head> or <tbody> insertion), this will be a
|
639
|
-
* zero-length string.
|
640
|
-
*/
|
641
|
-
GumboStringPiece original_tag;
|
642
|
-
|
643
|
-
/**
|
644
|
-
* A GumboStringPiece pointing to the original end tag text for this element.
|
645
|
-
* If the end tag was inserted algorithmically, (for example, closing a
|
646
|
-
* self-closing tag), this will be a zero-length string.
|
647
|
-
*/
|
648
|
-
GumboStringPiece original_end_tag;
|
649
|
-
|
650
|
-
/** The source position for the start of the start tag. */
|
651
|
-
GumboSourcePosition start_pos;
|
652
|
-
|
653
|
-
/** The source position for the start of the end tag. */
|
654
|
-
GumboSourcePosition end_pos;
|
655
|
-
|
656
|
-
/**
|
657
|
-
* An array of GumboAttributes, containing the attributes for this tag in the
|
658
|
-
* order that they were parsed. Pointers are owned.
|
659
|
-
*/
|
660
|
-
GumboVector /* GumboAttribute* */ attributes;
|
661
|
-
} GumboElement;
|
662
|
-
|
663
|
-
/**
|
664
|
-
* A supertype for GumboDocument, GumboElement and GumboText, so that we can include one
|
665
|
-
* generic type in lists of children and cast as necessary to subtypes.
|
666
|
-
*/
|
667
|
-
struct _GumboNode {
|
668
|
-
/** The type of node that this is. */
|
669
|
-
GumboNodeType type;
|
670
|
-
|
671
|
-
/** Pointer back to parent node. Not owned. */
|
672
|
-
GumboNode* parent;
|
673
|
-
|
674
|
-
/** The index within the parent's children vector of this node. */
|
675
|
-
size_t index_within_parent;
|
676
|
-
|
677
|
-
/**
|
678
|
-
* A bitvector of flags containing information about why this element was
|
679
|
-
* inserted into the parse tree, including a variety of special parse
|
680
|
-
* situations.
|
681
|
-
*/
|
682
|
-
GumboParseFlags parse_flags;
|
683
|
-
|
684
|
-
/** The actual node data. */
|
685
|
-
union {
|
686
|
-
GumboDocument document; // For GUMBO_NODE_DOCUMENT.
|
687
|
-
GumboElement element; // For GUMBO_NODE_ELEMENT.
|
688
|
-
GumboText text; // For everything else.
|
689
|
-
} v;
|
690
|
-
};
|
691
|
-
|
692
|
-
/**
|
693
|
-
* The type for an allocator function. Takes the 'userdata' member of the
|
694
|
-
* GumboParser struct as its first argument. Semantics should be the same as
|
695
|
-
* malloc, i.e. return a block of size_t bytes on success or NULL on failure.
|
696
|
-
* Allocating a block of 0 bytes behaves as per malloc.
|
697
|
-
*/
|
698
|
-
// TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
|
699
|
-
typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
|
700
|
-
|
701
|
-
/**
|
702
|
-
* The type for a deallocator function. Takes the 'userdata' member of the
|
703
|
-
* GumboParser struct as its first argument.
|
704
|
-
*/
|
705
|
-
typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
|
706
|
-
|
707
|
-
/**
|
708
|
-
* Input struct containing configuration options for the parser.
|
709
|
-
* These let you specify alternate memory managers, provide different error
|
710
|
-
* handling, etc.
|
711
|
-
* Use kGumboDefaultOptions for sensible defaults, and only set what you need.
|
712
|
-
*/
|
713
|
-
typedef struct _GumboOptions {
|
714
|
-
/** A memory allocator function. Default: malloc. */
|
715
|
-
GumboAllocatorFunction allocator;
|
716
|
-
|
717
|
-
/** A memory deallocator function. Default: free. */
|
718
|
-
GumboDeallocatorFunction deallocator;
|
719
|
-
|
720
|
-
/**
|
721
|
-
* An opaque object that's passed in as the first argument to all callbacks
|
722
|
-
* used by this library. Default: NULL.
|
723
|
-
*/
|
724
|
-
void* userdata;
|
725
|
-
|
726
|
-
/**
|
727
|
-
* The tab-stop size, for computing positions in source code that uses tabs.
|
728
|
-
* Default: 8.
|
729
|
-
*/
|
730
|
-
int tab_stop;
|
731
|
-
|
732
|
-
/**
|
733
|
-
* Whether or not to stop parsing when the first error is encountered.
|
734
|
-
* Default: false.
|
735
|
-
*/
|
736
|
-
bool stop_on_first_error;
|
737
|
-
|
738
|
-
/**
|
739
|
-
* The maximum number of errors before the parser stops recording them. This
|
740
|
-
* is provided so that if the page is totally borked, we don't completely fill
|
741
|
-
* up the errors vector and exhaust memory with useless redundant errors. Set
|
742
|
-
* to -1 to disable the limit.
|
743
|
-
* Default: -1
|
744
|
-
*/
|
745
|
-
int max_errors;
|
746
|
-
} GumboOptions;
|
747
|
-
|
748
|
-
/** Default options struct; use this with gumbo_parse_with_options. */
|
749
|
-
extern const GumboOptions kGumboDefaultOptions;
|
750
|
-
|
751
|
-
/** The output struct containing the results of the parse. */
|
752
|
-
typedef struct _GumboOutput {
|
753
|
-
/**
|
754
|
-
* Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT
|
755
|
-
* that contains the entire document as its child.
|
756
|
-
*/
|
757
|
-
GumboNode* document;
|
758
|
-
|
759
|
-
/**
|
760
|
-
* Pointer to the root node. This is the <html> tag that forms the root of the
|
761
|
-
* document.
|
762
|
-
*/
|
763
|
-
GumboNode* root;
|
764
|
-
|
765
|
-
/**
|
766
|
-
* A list of errors that occurred during the parse.
|
767
|
-
* NOTE: In version 1.0 of this library, the API for errors hasn't been fully
|
768
|
-
* fleshed out and may change in the future. For this reason, the GumboError
|
769
|
-
* header isn't part of the public API. Contact us if you need errors
|
770
|
-
* reported so we can work out something appropriate for your use-case.
|
771
|
-
*/
|
772
|
-
GumboVector /* GumboError */ errors;
|
773
|
-
} GumboOutput;
|
774
|
-
|
775
|
-
/**
|
776
|
-
* Parses a buffer of UTF8 text into a GumboNode parse tree. The buffer must
|
777
|
-
* live at least as long as the parse tree, as some fields (eg. original_text)
|
778
|
-
* point directly into the original buffer.
|
779
|
-
*
|
780
|
-
* This doesn't support buffers longer than 4 gigabytes.
|
781
|
-
*/
|
782
|
-
struct _GumboOutput* gumbo_parse(const char* buffer);
|
783
|
-
|
784
|
-
/**
|
785
|
-
* Extended version of gumbo_parse that takes an explicit options structure,
|
786
|
-
* buffer, and length.
|
787
|
-
*/
|
788
|
-
struct _GumboOutput* gumbo_parse_with_options(
|
789
|
-
const GumboOptions* options, const char* buffer, size_t buffer_length);
|
790
|
-
|
791
|
-
/** Release the memory used for the parse tree & parse errors. */
|
792
|
-
void gumbo_destroy_output(
|
793
|
-
const struct _GumboOptions* options, GumboOutput* output);
|
794
|
-
|
795
|
-
|
796
|
-
#ifdef __cplusplus
|
797
|
-
}
|
798
|
-
#endif
|
799
|
-
|
800
|
-
#endif // GUMBO_GUMBO_H_
|