ruby-gumbo 1.0.2 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.mkd +28 -31
- data/Rakefile +60 -59
- data/ext/extconf.rb +17 -9
- data/ext/{gumbo.c → ruby_gumbo_ext.c} +29 -28
- data/lib/gumbo.rb +19 -0
- data/lib/gumbo/element.rb +52 -0
- data/lib/gumbo/{extra.rb → node.rb} +19 -22
- data/lib/gumbo/text.rb +29 -0
- data/vendor/gumbo-parser/src/attribute.c +44 -0
- data/vendor/gumbo-parser/src/attribute.h +37 -0
- data/vendor/gumbo-parser/src/char_ref.c +2561 -0
- data/vendor/gumbo-parser/src/char_ref.h +61 -0
- data/vendor/gumbo-parser/src/error.c +258 -0
- data/vendor/gumbo-parser/src/error.h +227 -0
- data/vendor/gumbo-parser/src/gumbo.h +807 -0
- data/vendor/gumbo-parser/src/insertion_mode.h +57 -0
- data/vendor/gumbo-parser/src/parser.c +3917 -0
- data/vendor/gumbo-parser/src/parser.h +57 -0
- data/vendor/gumbo-parser/src/string_buffer.c +106 -0
- data/vendor/gumbo-parser/src/string_buffer.h +81 -0
- data/vendor/gumbo-parser/src/string_piece.c +49 -0
- data/vendor/gumbo-parser/src/string_piece.h +39 -0
- data/vendor/gumbo-parser/src/tag.c +225 -0
- data/vendor/gumbo-parser/src/token_type.h +40 -0
- data/vendor/gumbo-parser/src/tokenizer.c +2980 -0
- data/vendor/gumbo-parser/src/tokenizer.h +123 -0
- data/vendor/gumbo-parser/src/tokenizer_states.h +103 -0
- data/vendor/gumbo-parser/src/utf8.c +275 -0
- data/vendor/gumbo-parser/src/utf8.h +127 -0
- data/vendor/gumbo-parser/src/util.c +58 -0
- data/vendor/gumbo-parser/src/util.h +62 -0
- data/vendor/gumbo-parser/src/vector.c +123 -0
- data/vendor/gumbo-parser/src/vector.h +69 -0
- metadata +40 -10
- data/ext/extconf.h +0 -3
@@ -0,0 +1,807 @@
|
|
1
|
+
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
+
//
|
17
|
+
// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
|
18
|
+
// GUMBO_ as a prefix for enum constants (static constants get the Google-style
|
19
|
+
// kGumbo prefix).
|
20
|
+
|
21
|
+
/**
|
22
|
+
* @file
|
23
|
+
* @mainpage Gumbo HTML Parser
|
24
|
+
*
|
25
|
+
* This provides a conformant, no-dependencies implementation of the HTML5
|
26
|
+
* parsing algorithm. It supports only UTF8; if you need to parse a different
|
27
|
+
* encoding, run a preprocessing step to convert to UTF8. It returns a parse
|
28
|
+
* tree made of the structs in this file.
|
29
|
+
*
|
30
|
+
* Example:
|
31
|
+
* @code
|
32
|
+
* GumboOutput* output = gumbo_parse(input);
|
33
|
+
* do_something_with_doctype(output->document);
|
34
|
+
* do_something_with_html_tree(output->root);
|
35
|
+
* gumbo_destroy_output(&kGumboDefaultOptions, output);
|
36
|
+
* @endcode
|
37
|
+
* HTML5 Spec:
|
38
|
+
*
|
39
|
+
* http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
|
40
|
+
*/
|
41
|
+
|
42
|
+
#ifndef GUMBO_GUMBO_H_
|
43
|
+
#define GUMBO_GUMBO_H_
|
44
|
+
|
45
|
+
#ifdef _MSC_VER
|
46
|
+
#define _CRT_SECURE_NO_WARNINGS
|
47
|
+
#define fileno _fileno
|
48
|
+
#endif
|
49
|
+
|
50
|
+
#include <stdbool.h>
|
51
|
+
#include <stddef.h>
|
52
|
+
|
53
|
+
#ifdef __cplusplus
|
54
|
+
extern "C" {
|
55
|
+
#endif
|
56
|
+
|
57
|
+
/**
|
58
|
+
* A struct representing a character position within the original text buffer.
|
59
|
+
* Line and column numbers are 1-based and offsets are 0-based, which matches
|
60
|
+
* how most editors and command-line tools work. Also, columns measure
|
61
|
+
* positions in terms of characters while offsets measure by bytes; this is
|
62
|
+
* because the offset field is often used to pull out a particular region of
|
63
|
+
* text (which in most languages that bind to C implies pointer arithmetic on a
|
64
|
+
* buffer of bytes), while the column field is often used to reference a
|
65
|
+
* particular column on a printable display, which nowadays is usually UTF-8.
|
66
|
+
*/
|
67
|
+
typedef struct {
|
68
|
+
unsigned int line;
|
69
|
+
unsigned int column;
|
70
|
+
unsigned int offset;
|
71
|
+
} GumboSourcePosition;
|
72
|
+
|
73
|
+
/**
|
74
|
+
* A SourcePosition used for elements that have no source position, i.e.
|
75
|
+
* parser-inserted elements.
|
76
|
+
*/
|
77
|
+
extern const GumboSourcePosition kGumboEmptySourcePosition;
|
78
|
+
|
79
|
+
|
80
|
+
/**
|
81
|
+
* A struct representing a string or part of a string. Strings within the
|
82
|
+
* parser are represented by a char* and a length; the char* points into
|
83
|
+
* an existing data buffer owned by some other code (often the original input).
|
84
|
+
* GumboStringPieces are assumed (by convention) to be immutable, because they
|
85
|
+
* may share data. Use GumboStringBuffer if you need to construct a string.
|
86
|
+
* Clients should assume that it is not NUL-terminated, and should always use
|
87
|
+
* explicit lengths when manipulating them.
|
88
|
+
*/
|
89
|
+
typedef struct {
|
90
|
+
/** A pointer to the beginning of the string. NULL iff length == 0. */
|
91
|
+
const char* data;
|
92
|
+
|
93
|
+
/** The length of the string fragment, in bytes. May be zero. */
|
94
|
+
size_t length;
|
95
|
+
} GumboStringPiece;
|
96
|
+
|
97
|
+
/** A constant to represent a 0-length null string. */
|
98
|
+
extern const GumboStringPiece kGumboEmptyString;
|
99
|
+
|
100
|
+
/**
|
101
|
+
* Compares two GumboStringPieces, and returns true if they're equal or false
|
102
|
+
* otherwise.
|
103
|
+
*/
|
104
|
+
bool gumbo_string_equals(
|
105
|
+
const GumboStringPiece* str1, const GumboStringPiece* str2);
|
106
|
+
|
107
|
+
/**
|
108
|
+
* Compares two GumboStringPieces ignoring case, and returns true if they're
|
109
|
+
* equal or false otherwise.
|
110
|
+
*/
|
111
|
+
bool gumbo_string_equals_ignore_case(
|
112
|
+
const GumboStringPiece* str1, const GumboStringPiece* str2);
|
113
|
+
|
114
|
+
|
115
|
+
/**
|
116
|
+
* A simple vector implementation. This stores a pointer to a data array and a
|
117
|
+
* length. All elements are stored as void*; client code must cast to the
|
118
|
+
* appropriate type. Overflows upon addition result in reallocation of the data
|
119
|
+
* array, with the size doubling to maintain O(1) amortized cost. There is no
|
120
|
+
* removal function, as this isn't needed for any of the operations within this
|
121
|
+
* library. Iteration can be done through inspecting the structure directly in
|
122
|
+
* a for-loop.
|
123
|
+
*/
|
124
|
+
typedef struct {
|
125
|
+
/** Data elements. This points to a dynamically-allocated array of capacity
|
126
|
+
* elements, each a void* to the element itself.
|
127
|
+
*/
|
128
|
+
void** data;
|
129
|
+
|
130
|
+
/** Number of elements currently in the vector. */
|
131
|
+
unsigned int length;
|
132
|
+
|
133
|
+
/** Current array capacity. */
|
134
|
+
unsigned int capacity;
|
135
|
+
} GumboVector;
|
136
|
+
|
137
|
+
/** An empty (0-length, 0-capacity) GumboVector. */
|
138
|
+
extern const GumboVector kGumboEmptyVector;
|
139
|
+
|
140
|
+
/**
|
141
|
+
* Returns the first index at which an element appears in this vector (testing
|
142
|
+
* by pointer equality), or -1 if it never does.
|
143
|
+
*/
|
144
|
+
int gumbo_vector_index_of(GumboVector* vector, void* element);
|
145
|
+
|
146
|
+
|
147
|
+
/**
|
148
|
+
* An enum for all the tags defined in the HTML5 standard. These correspond to
|
149
|
+
* the tag names themselves. Enum constants exist only for tags which appear in
|
150
|
+
* the spec itself (or for tags with special handling in the SVG and MathML
|
151
|
+
* namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag
|
152
|
+
* name can be obtained through original_tag.
|
153
|
+
*
|
154
|
+
* This is mostly for API convenience, so that clients of this library don't
|
155
|
+
* need to perform a strcasecmp to find the normalized tag name. It also has
|
156
|
+
* efficiency benefits, by letting the parser work with enums instead of
|
157
|
+
* strings.
|
158
|
+
*/
|
159
|
+
typedef enum {
|
160
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
|
161
|
+
GUMBO_TAG_HTML,
|
162
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
|
163
|
+
GUMBO_TAG_HEAD,
|
164
|
+
GUMBO_TAG_TITLE,
|
165
|
+
GUMBO_TAG_BASE,
|
166
|
+
GUMBO_TAG_LINK,
|
167
|
+
GUMBO_TAG_META,
|
168
|
+
GUMBO_TAG_STYLE,
|
169
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
|
170
|
+
GUMBO_TAG_SCRIPT,
|
171
|
+
GUMBO_TAG_NOSCRIPT,
|
172
|
+
GUMBO_TAG_TEMPLATE,
|
173
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
|
174
|
+
GUMBO_TAG_BODY,
|
175
|
+
GUMBO_TAG_ARTICLE,
|
176
|
+
GUMBO_TAG_SECTION,
|
177
|
+
GUMBO_TAG_NAV,
|
178
|
+
GUMBO_TAG_ASIDE,
|
179
|
+
GUMBO_TAG_H1,
|
180
|
+
GUMBO_TAG_H2,
|
181
|
+
GUMBO_TAG_H3,
|
182
|
+
GUMBO_TAG_H4,
|
183
|
+
GUMBO_TAG_H5,
|
184
|
+
GUMBO_TAG_H6,
|
185
|
+
GUMBO_TAG_HGROUP,
|
186
|
+
GUMBO_TAG_HEADER,
|
187
|
+
GUMBO_TAG_FOOTER,
|
188
|
+
GUMBO_TAG_ADDRESS,
|
189
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/grouping-content.html#grouping-content
|
190
|
+
GUMBO_TAG_P,
|
191
|
+
GUMBO_TAG_HR,
|
192
|
+
GUMBO_TAG_PRE,
|
193
|
+
GUMBO_TAG_BLOCKQUOTE,
|
194
|
+
GUMBO_TAG_OL,
|
195
|
+
GUMBO_TAG_UL,
|
196
|
+
GUMBO_TAG_LI,
|
197
|
+
GUMBO_TAG_DL,
|
198
|
+
GUMBO_TAG_DT,
|
199
|
+
GUMBO_TAG_DD,
|
200
|
+
GUMBO_TAG_FIGURE,
|
201
|
+
GUMBO_TAG_FIGCAPTION,
|
202
|
+
GUMBO_TAG_MAIN,
|
203
|
+
GUMBO_TAG_DIV,
|
204
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
|
205
|
+
GUMBO_TAG_A,
|
206
|
+
GUMBO_TAG_EM,
|
207
|
+
GUMBO_TAG_STRONG,
|
208
|
+
GUMBO_TAG_SMALL,
|
209
|
+
GUMBO_TAG_S,
|
210
|
+
GUMBO_TAG_CITE,
|
211
|
+
GUMBO_TAG_Q,
|
212
|
+
GUMBO_TAG_DFN,
|
213
|
+
GUMBO_TAG_ABBR,
|
214
|
+
GUMBO_TAG_DATA,
|
215
|
+
GUMBO_TAG_TIME,
|
216
|
+
GUMBO_TAG_CODE,
|
217
|
+
GUMBO_TAG_VAR,
|
218
|
+
GUMBO_TAG_SAMP,
|
219
|
+
GUMBO_TAG_KBD,
|
220
|
+
GUMBO_TAG_SUB,
|
221
|
+
GUMBO_TAG_SUP,
|
222
|
+
GUMBO_TAG_I,
|
223
|
+
GUMBO_TAG_B,
|
224
|
+
GUMBO_TAG_U,
|
225
|
+
GUMBO_TAG_MARK,
|
226
|
+
GUMBO_TAG_RUBY,
|
227
|
+
GUMBO_TAG_RT,
|
228
|
+
GUMBO_TAG_RP,
|
229
|
+
GUMBO_TAG_BDI,
|
230
|
+
GUMBO_TAG_BDO,
|
231
|
+
GUMBO_TAG_SPAN,
|
232
|
+
GUMBO_TAG_BR,
|
233
|
+
GUMBO_TAG_WBR,
|
234
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/edits.html#edits
|
235
|
+
GUMBO_TAG_INS,
|
236
|
+
GUMBO_TAG_DEL,
|
237
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/embedded-content-1.html#embedded-content-1
|
238
|
+
GUMBO_TAG_IMAGE,
|
239
|
+
GUMBO_TAG_IMG,
|
240
|
+
GUMBO_TAG_IFRAME,
|
241
|
+
GUMBO_TAG_EMBED,
|
242
|
+
GUMBO_TAG_OBJECT,
|
243
|
+
GUMBO_TAG_PARAM,
|
244
|
+
GUMBO_TAG_VIDEO,
|
245
|
+
GUMBO_TAG_AUDIO,
|
246
|
+
GUMBO_TAG_SOURCE,
|
247
|
+
GUMBO_TAG_TRACK,
|
248
|
+
GUMBO_TAG_CANVAS,
|
249
|
+
GUMBO_TAG_MAP,
|
250
|
+
GUMBO_TAG_AREA,
|
251
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#mathml
|
252
|
+
GUMBO_TAG_MATH,
|
253
|
+
GUMBO_TAG_MI,
|
254
|
+
GUMBO_TAG_MO,
|
255
|
+
GUMBO_TAG_MN,
|
256
|
+
GUMBO_TAG_MS,
|
257
|
+
GUMBO_TAG_MTEXT,
|
258
|
+
GUMBO_TAG_MGLYPH,
|
259
|
+
GUMBO_TAG_MALIGNMARK,
|
260
|
+
GUMBO_TAG_ANNOTATION_XML,
|
261
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-map-element.html#svg-0
|
262
|
+
GUMBO_TAG_SVG,
|
263
|
+
GUMBO_TAG_FOREIGNOBJECT,
|
264
|
+
GUMBO_TAG_DESC,
|
265
|
+
// SVG title tags will have GUMBO_TAG_TITLE as with HTML.
|
266
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tabular-data.html#tabular-data
|
267
|
+
GUMBO_TAG_TABLE,
|
268
|
+
GUMBO_TAG_CAPTION,
|
269
|
+
GUMBO_TAG_COLGROUP,
|
270
|
+
GUMBO_TAG_COL,
|
271
|
+
GUMBO_TAG_TBODY,
|
272
|
+
GUMBO_TAG_THEAD,
|
273
|
+
GUMBO_TAG_TFOOT,
|
274
|
+
GUMBO_TAG_TR,
|
275
|
+
GUMBO_TAG_TD,
|
276
|
+
GUMBO_TAG_TH,
|
277
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/forms.html#forms
|
278
|
+
GUMBO_TAG_FORM,
|
279
|
+
GUMBO_TAG_FIELDSET,
|
280
|
+
GUMBO_TAG_LEGEND,
|
281
|
+
GUMBO_TAG_LABEL,
|
282
|
+
GUMBO_TAG_INPUT,
|
283
|
+
GUMBO_TAG_BUTTON,
|
284
|
+
GUMBO_TAG_SELECT,
|
285
|
+
GUMBO_TAG_DATALIST,
|
286
|
+
GUMBO_TAG_OPTGROUP,
|
287
|
+
GUMBO_TAG_OPTION,
|
288
|
+
GUMBO_TAG_TEXTAREA,
|
289
|
+
GUMBO_TAG_KEYGEN,
|
290
|
+
GUMBO_TAG_OUTPUT,
|
291
|
+
GUMBO_TAG_PROGRESS,
|
292
|
+
GUMBO_TAG_METER,
|
293
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
|
294
|
+
GUMBO_TAG_DETAILS,
|
295
|
+
GUMBO_TAG_SUMMARY,
|
296
|
+
GUMBO_TAG_MENU,
|
297
|
+
GUMBO_TAG_MENUITEM,
|
298
|
+
// Non-conforming elements that nonetheless appear in the HTML5 spec.
|
299
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
|
300
|
+
GUMBO_TAG_APPLET,
|
301
|
+
GUMBO_TAG_ACRONYM,
|
302
|
+
GUMBO_TAG_BGSOUND,
|
303
|
+
GUMBO_TAG_DIR,
|
304
|
+
GUMBO_TAG_FRAME,
|
305
|
+
GUMBO_TAG_FRAMESET,
|
306
|
+
GUMBO_TAG_NOFRAMES,
|
307
|
+
GUMBO_TAG_ISINDEX,
|
308
|
+
GUMBO_TAG_LISTING,
|
309
|
+
GUMBO_TAG_XMP,
|
310
|
+
GUMBO_TAG_NEXTID,
|
311
|
+
GUMBO_TAG_NOEMBED,
|
312
|
+
GUMBO_TAG_PLAINTEXT,
|
313
|
+
GUMBO_TAG_RB,
|
314
|
+
GUMBO_TAG_STRIKE,
|
315
|
+
GUMBO_TAG_BASEFONT,
|
316
|
+
GUMBO_TAG_BIG,
|
317
|
+
GUMBO_TAG_BLINK,
|
318
|
+
GUMBO_TAG_CENTER,
|
319
|
+
GUMBO_TAG_FONT,
|
320
|
+
GUMBO_TAG_MARQUEE,
|
321
|
+
GUMBO_TAG_MULTICOL,
|
322
|
+
GUMBO_TAG_NOBR,
|
323
|
+
GUMBO_TAG_SPACER,
|
324
|
+
GUMBO_TAG_TT,
|
325
|
+
// Used for all tags that don't have special handling in HTML.
|
326
|
+
GUMBO_TAG_UNKNOWN,
|
327
|
+
// A marker value to indicate the end of the enum, for iterating over it.
|
328
|
+
// Also used as the terminator for varargs functions that take tags.
|
329
|
+
GUMBO_TAG_LAST,
|
330
|
+
} GumboTag;
|
331
|
+
|
332
|
+
/**
|
333
|
+
* Returns the normalized (usually all-lowercased, except for foreign content)
|
334
|
+
* tag name for a GumboTag enum. Return value is static data owned by the
|
335
|
+
* library.
|
336
|
+
*/
|
337
|
+
const char* gumbo_normalized_tagname(GumboTag tag);
|
338
|
+
|
339
|
+
/**
|
340
|
+
* Extracts the tag name from the original_text field of an element or token by
|
341
|
+
* stripping off </> characters and attributes and adjusting the passed-in
|
342
|
+
* GumboStringPiece appropriately. The tag name is in the original case and
|
343
|
+
* shares a buffer with the original text, to simplify memory management.
|
344
|
+
* Behavior is undefined if a string-piece that doesn't represent an HTML tag
|
345
|
+
* (<tagname> or </tagname>) is passed in. If the string piece is completely
|
346
|
+
* empty (NULL data pointer), then this function will exit successfully as a
|
347
|
+
* no-op.
|
348
|
+
*/
|
349
|
+
void gumbo_tag_from_original_text(GumboStringPiece* text);
|
350
|
+
|
351
|
+
/**
|
352
|
+
* Fixes the case of SVG elements that are not all lowercase.
|
353
|
+
* http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign
|
354
|
+
* This is not done at parse time because there's no place to store a mutated
|
355
|
+
* tag name. The tag field is an enum (GUMBO_TAG_UNKNOWN for most SVG tags
|
356
|
+
* without special handling), while the original_tag field is a pointer into the
|
357
|
+
* original buffer. Instead, we provide this helper function that clients can
|
358
|
+
* use to rename SVG tags as appropriate.
|
359
|
+
* Returns the case-normalized SVG tagname if a replacement is found, or NULL if
|
360
|
+
* no normalization is called for. The return value is static data and owned by
|
361
|
+
* the library.
|
362
|
+
*/
|
363
|
+
const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
|
364
|
+
|
365
|
+
/**
|
366
|
+
* Converts a tag name string (which may be in upper or mixed case) to a tag
|
367
|
+
* enum.
|
368
|
+
*/
|
369
|
+
GumboTag gumbo_tag_enum(const char* tagname);
|
370
|
+
|
371
|
+
/**
|
372
|
+
* Attribute namespaces.
|
373
|
+
* HTML includes special handling for XLink, XML, and XMLNS namespaces on
|
374
|
+
* attributes. Everything else goes in the generic "NONE" namespace.
|
375
|
+
*/
|
376
|
+
typedef enum {
|
377
|
+
GUMBO_ATTR_NAMESPACE_NONE,
|
378
|
+
GUMBO_ATTR_NAMESPACE_XLINK,
|
379
|
+
GUMBO_ATTR_NAMESPACE_XML,
|
380
|
+
GUMBO_ATTR_NAMESPACE_XMLNS,
|
381
|
+
} GumboAttributeNamespaceEnum;
|
382
|
+
|
383
|
+
/**
|
384
|
+
* A struct representing a single attribute on an HTML tag. This is a
|
385
|
+
* name-value pair, but also includes information about source locations and
|
386
|
+
* original source text.
|
387
|
+
*/
|
388
|
+
typedef struct {
|
389
|
+
/**
|
390
|
+
* The namespace for the attribute. This will usually be
|
391
|
+
* GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
|
392
|
+
* values, per:
|
393
|
+
* http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#adjust-foreign-attributes
|
394
|
+
*/
|
395
|
+
GumboAttributeNamespaceEnum attr_namespace;
|
396
|
+
|
397
|
+
/**
|
398
|
+
* The name of the attribute. This is in a freshly-allocated buffer to deal
|
399
|
+
* with case-normalization, and is null-terminated.
|
400
|
+
*/
|
401
|
+
const char* name;
|
402
|
+
|
403
|
+
/**
|
404
|
+
* The original text of the attribute name, as a pointer into the original
|
405
|
+
* source buffer.
|
406
|
+
*/
|
407
|
+
GumboStringPiece original_name;
|
408
|
+
|
409
|
+
/**
|
410
|
+
* The value of the attribute. This is in a freshly-allocated buffer to deal
|
411
|
+
* with unescaping, and is null-terminated. It does not include any quotes
|
412
|
+
* that surround the attribute. If the attribute has no value (for example,
|
413
|
+
* 'selected' on a checkbox), this will be an empty string.
|
414
|
+
*/
|
415
|
+
const char* value;
|
416
|
+
|
417
|
+
/**
|
418
|
+
* The original text of the value of the attribute. This points into the
|
419
|
+
* original source buffer. It includes any quotes that surround the
|
420
|
+
* attribute, and you can look at original_value.data[0] and
|
421
|
+
* original_value.data[original_value.length - 1] to determine what the quote
|
422
|
+
* characters were. If the attribute has no value, this will be a 0-length
|
423
|
+
* string.
|
424
|
+
*/
|
425
|
+
GumboStringPiece original_value;
|
426
|
+
|
427
|
+
/** The starting position of the attribute name. */
|
428
|
+
GumboSourcePosition name_start;
|
429
|
+
|
430
|
+
/**
|
431
|
+
* The ending position of the attribute name. This is not always derivable
|
432
|
+
* from the starting position of the value because of the possibility of
|
433
|
+
* whitespace around the = sign.
|
434
|
+
*/
|
435
|
+
GumboSourcePosition name_end;
|
436
|
+
|
437
|
+
/** The starting position of the attribute value. */
|
438
|
+
GumboSourcePosition value_start;
|
439
|
+
|
440
|
+
/** The ending position of the attribute value. */
|
441
|
+
GumboSourcePosition value_end;
|
442
|
+
} GumboAttribute;
|
443
|
+
|
444
|
+
/**
|
445
|
+
* Given a vector of GumboAttributes, look up the one with the specified name
|
446
|
+
* and return it, or NULL if no such attribute exists. This uses a
|
447
|
+
* case-insensitive match, as HTML is case-insensitive.
|
448
|
+
*/
|
449
|
+
GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
|
450
|
+
|
451
|
+
/**
|
452
|
+
* Enum denoting the type of node. This determines the type of the node.v
|
453
|
+
* union.
|
454
|
+
*/
|
455
|
+
typedef enum {
|
456
|
+
/** Document node. v will be a GumboDocument. */
|
457
|
+
GUMBO_NODE_DOCUMENT,
|
458
|
+
/** Element node. v will be a GumboElement. */
|
459
|
+
GUMBO_NODE_ELEMENT,
|
460
|
+
/** Text node. v will be a GumboText. */
|
461
|
+
GUMBO_NODE_TEXT,
|
462
|
+
/** CDATA node. v will be a GumboText. */
|
463
|
+
GUMBO_NODE_CDATA,
|
464
|
+
/** Comment node. v will be a GumboText, excluding comment delimiters. */
|
465
|
+
GUMBO_NODE_COMMENT,
|
466
|
+
/** Text node whose contents are all whitespace. v will be a GumboText. */
|
467
|
+
GUMBO_NODE_WHITESPACE
|
468
|
+
} GumboNodeType;
|
469
|
+
|
470
|
+
/**
|
471
|
+
* Forward declaration of GumboNode so it can be used recursively in
|
472
|
+
* GumboNode.parent.
|
473
|
+
*/
|
474
|
+
typedef struct GumboInternalNode GumboNode;
|
475
|
+
|
476
|
+
/** http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode */
|
477
|
+
typedef enum {
|
478
|
+
GUMBO_DOCTYPE_NO_QUIRKS,
|
479
|
+
GUMBO_DOCTYPE_QUIRKS,
|
480
|
+
GUMBO_DOCTYPE_LIMITED_QUIRKS
|
481
|
+
} GumboQuirksModeEnum;
|
482
|
+
|
483
|
+
/**
|
484
|
+
* Namespaces.
|
485
|
+
* Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather,
|
486
|
+
* anything inside an <svg> tag is in the SVG namespace, anything inside the
|
487
|
+
* <math> tag is in the MathML namespace, and anything else is inside the HTML
|
488
|
+
* namespace. No other namespaces are supported, so this can be an enum only.
|
489
|
+
*/
|
490
|
+
typedef enum {
|
491
|
+
GUMBO_NAMESPACE_HTML,
|
492
|
+
GUMBO_NAMESPACE_SVG,
|
493
|
+
GUMBO_NAMESPACE_MATHML
|
494
|
+
} GumboNamespaceEnum;
|
495
|
+
|
496
|
+
/**
|
497
|
+
* Parse flags.
|
498
|
+
* We track the reasons for parser insertion of nodes and store them in a
|
499
|
+
* bitvector in the node itself. This lets client code optimize out nodes that
|
500
|
+
* are implied by the HTML structure of the document, or flag constructs that
|
501
|
+
* may not be allowed by a style guide, or track the prevalence of incorrect or
|
502
|
+
* tricky HTML code.
|
503
|
+
*/
|
504
|
+
typedef enum {
|
505
|
+
/**
|
506
|
+
* A normal node - both start and end tags appear in the source, nothing has
|
507
|
+
* been reparented.
|
508
|
+
*/
|
509
|
+
GUMBO_INSERTION_NORMAL = 0,
|
510
|
+
|
511
|
+
/**
|
512
|
+
* A node inserted by the parser to fulfill some implicit insertion rule.
|
513
|
+
* This is usually set in addition to some other flag giving a more specific
|
514
|
+
* insertion reason; it's a generic catch-all term meaning "The start tag for
|
515
|
+
* this node did not appear in the document source".
|
516
|
+
*/
|
517
|
+
GUMBO_INSERTION_BY_PARSER = 1 << 0,
|
518
|
+
|
519
|
+
/**
|
520
|
+
* A flag indicating that the end tag for this node did not appear in the
|
521
|
+
* document source. Note that in some cases, you can still have
|
522
|
+
* parser-inserted nodes with an explicit end tag: for example, "Text</html>"
|
523
|
+
* has GUMBO_INSERTION_BY_PARSER set on the <html> node, but
|
524
|
+
* GUMBO_INSERTION_IMPLICIT_END_TAG is unset, as the </html> tag actually
|
525
|
+
* exists. This flag will be set only if the end tag is completely missing;
|
526
|
+
* in some cases, the end tag may be misplaced (eg. a </body> tag with text
|
527
|
+
* afterwards), which will leave this flag unset and require clients to
|
528
|
+
* inspect the parse errors for that case.
|
529
|
+
*/
|
530
|
+
GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
|
531
|
+
|
532
|
+
// Value 1 << 2 was for a flag that has since been removed.
|
533
|
+
|
534
|
+
/**
|
535
|
+
* A flag for nodes that are inserted because their presence is implied by
|
536
|
+
* other tags, eg. <html>, <head>, <body>, <tbody>, etc.
|
537
|
+
*/
|
538
|
+
GUMBO_INSERTION_IMPLIED = 1 << 3,
|
539
|
+
|
540
|
+
/**
|
541
|
+
* A flag for nodes that are converted from their end tag equivalents. For
|
542
|
+
* example, </p> when no paragraph is open implies that the parser should
|
543
|
+
* create a <p> tag and immediately close it, while </br> means the same thing
|
544
|
+
* as <br>.
|
545
|
+
*/
|
546
|
+
GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
|
547
|
+
|
548
|
+
/** A flag for nodes that are converted from the parse of an <isindex> tag. */
|
549
|
+
GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
|
550
|
+
|
551
|
+
/** A flag for <image> tags that are rewritten as <img>. */
|
552
|
+
GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
|
553
|
+
|
554
|
+
/**
|
555
|
+
* A flag for nodes that are cloned as a result of the reconstruction of
|
556
|
+
* active formatting elements. This is set only on the clone; the initial
|
557
|
+
* portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.
|
558
|
+
*/
|
559
|
+
GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
|
560
|
+
|
561
|
+
/** A flag for nodes that are cloned by the adoption agency algorithm. */
|
562
|
+
GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
|
563
|
+
|
564
|
+
/** A flag for nodes that are moved by the adoption agency algorithm. */
|
565
|
+
GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
|
566
|
+
|
567
|
+
/**
|
568
|
+
* A flag for nodes that have been foster-parented out of a table (or
|
569
|
+
* should've been foster-parented, if verbatim mode is set).
|
570
|
+
*/
|
571
|
+
GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
|
572
|
+
} GumboParseFlags;
|
573
|
+
|
574
|
+
|
575
|
+
/**
|
576
|
+
* Information specific to document nodes.
|
577
|
+
*/
|
578
|
+
typedef struct {
|
579
|
+
/**
|
580
|
+
* An array of GumboNodes, containing the children of this document. This will
|
581
|
+
* normally consist of the <html> element and any comment nodes found.
|
582
|
+
* Pointers are owned.
|
583
|
+
*/
|
584
|
+
GumboVector /* GumboNode* */ children;
|
585
|
+
|
586
|
+
// True if there was an explicit doctype token as opposed to it being omitted.
|
587
|
+
bool has_doctype;
|
588
|
+
|
589
|
+
// Fields from the doctype token, copied verbatim.
|
590
|
+
const char* name;
|
591
|
+
const char* public_identifier;
|
592
|
+
const char* system_identifier;
|
593
|
+
|
594
|
+
/**
|
595
|
+
* Whether or not the document is in QuirksMode, as determined by the values
|
596
|
+
* in the GumboTokenDocType template.
|
597
|
+
*/
|
598
|
+
GumboQuirksModeEnum doc_type_quirks_mode;
|
599
|
+
} GumboDocument;
|
600
|
+
|
601
|
+
/**
|
602
|
+
* The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
|
603
|
+
* This contains just a block of text and its position.
|
604
|
+
*/
|
605
|
+
typedef struct {
|
606
|
+
/**
|
607
|
+
* The text of this node, after entities have been parsed and decoded. For
|
608
|
+
* comment/cdata nodes, this does not include the comment delimiters.
|
609
|
+
*/
|
610
|
+
const char* text;
|
611
|
+
|
612
|
+
/**
|
613
|
+
* The original text of this node, as a pointer into the original buffer. For
|
614
|
+
* comment/cdata nodes, this includes the comment delimiters.
|
615
|
+
*/
|
616
|
+
GumboStringPiece original_text;
|
617
|
+
|
618
|
+
/**
|
619
|
+
* The starting position of this node. This corresponds to the position of
|
620
|
+
* original_text, before entities are decoded.
|
621
|
+
*/
|
622
|
+
GumboSourcePosition start_pos;
|
623
|
+
} GumboText;
|
624
|
+
|
625
|
+
/**
|
626
|
+
* The struct used to represent all HTML elements. This contains information
|
627
|
+
* about the tag, attributes, and child nodes.
|
628
|
+
*/
|
629
|
+
typedef struct {
|
630
|
+
/**
|
631
|
+
* An array of GumboNodes, containing the children of this element. Pointers
|
632
|
+
* are owned.
|
633
|
+
*/
|
634
|
+
GumboVector /* GumboNode* */ children;
|
635
|
+
|
636
|
+
/** The GumboTag enum for this element. */
|
637
|
+
GumboTag tag;
|
638
|
+
|
639
|
+
/** The GumboNamespaceEnum for this element. */
|
640
|
+
GumboNamespaceEnum tag_namespace;
|
641
|
+
|
642
|
+
/**
|
643
|
+
* A GumboStringPiece pointing to the original tag text for this element,
|
644
|
+
* pointing directly into the source buffer. If the tag was inserted
|
645
|
+
* algorithmically (for example, <head> or <tbody> insertion), this will be a
|
646
|
+
* zero-length string.
|
647
|
+
*/
|
648
|
+
GumboStringPiece original_tag;
|
649
|
+
|
650
|
+
/**
|
651
|
+
* A GumboStringPiece pointing to the original end tag text for this element.
|
652
|
+
* If the end tag was inserted algorithmically, (for example, closing a
|
653
|
+
* self-closing tag), this will be a zero-length string.
|
654
|
+
*/
|
655
|
+
GumboStringPiece original_end_tag;
|
656
|
+
|
657
|
+
/** The source position for the start of the start tag. */
|
658
|
+
GumboSourcePosition start_pos;
|
659
|
+
|
660
|
+
/** The source position for the start of the end tag. */
|
661
|
+
GumboSourcePosition end_pos;
|
662
|
+
|
663
|
+
/**
|
664
|
+
* An array of GumboAttributes, containing the attributes for this tag in the
|
665
|
+
* order that they were parsed. Pointers are owned.
|
666
|
+
*/
|
667
|
+
GumboVector /* GumboAttribute* */ attributes;
|
668
|
+
} GumboElement;
|
669
|
+
|
670
|
+
/**
|
671
|
+
* A supertype for GumboDocument, GumboElement and GumboText, so that we can include one
|
672
|
+
* generic type in lists of children and cast as necessary to subtypes.
|
673
|
+
*/
|
674
|
+
struct GumboInternalNode {
|
675
|
+
/** The type of node that this is. */
|
676
|
+
GumboNodeType type;
|
677
|
+
|
678
|
+
/** Pointer back to parent node. Not owned. */
|
679
|
+
GumboNode* parent;
|
680
|
+
|
681
|
+
/** The index within the parent's children vector of this node. */
|
682
|
+
size_t index_within_parent;
|
683
|
+
|
684
|
+
/**
|
685
|
+
* A bitvector of flags containing information about why this element was
|
686
|
+
* inserted into the parse tree, including a variety of special parse
|
687
|
+
* situations.
|
688
|
+
*/
|
689
|
+
GumboParseFlags parse_flags;
|
690
|
+
|
691
|
+
/** The actual node data. */
|
692
|
+
union {
|
693
|
+
GumboDocument document; // For GUMBO_NODE_DOCUMENT.
|
694
|
+
GumboElement element; // For GUMBO_NODE_ELEMENT.
|
695
|
+
GumboText text; // For everything else.
|
696
|
+
} v;
|
697
|
+
};
|
698
|
+
|
699
|
+
/**
|
700
|
+
* The type for an allocator function. Takes the 'userdata' member of the
|
701
|
+
* GumboParser struct as its first argument. Semantics should be the same as
|
702
|
+
* malloc, i.e. return a block of 'size' bytes on success or NULL on failure.
|
703
|
+
* Allocating a block of 0 bytes behaves as per malloc.
|
704
|
+
*/
|
705
|
+
// TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
|
706
|
+
typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
|
707
|
+
|
708
|
+
/**
|
709
|
+
* The type for a deallocator function. Takes the 'userdata' member of the
|
710
|
+
* GumboParser struct as its first argument.
|
711
|
+
*/
|
712
|
+
typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
|
713
|
+
|
714
|
+
/**
|
715
|
+
* Input struct containing configuration options for the parser.
|
716
|
+
* These let you specify alternate memory managers, provide different error
|
717
|
+
* handling, etc.
|
718
|
+
* Use kGumboDefaultOptions for sensible defaults, and only set what you need.
|
719
|
+
*/
|
720
|
+
typedef struct GumboInternalOptions {
|
721
|
+
/** A memory allocator function. Default: malloc. */
|
722
|
+
GumboAllocatorFunction allocator;
|
723
|
+
|
724
|
+
/** A memory deallocator function. Default: free. */
|
725
|
+
GumboDeallocatorFunction deallocator;
|
726
|
+
|
727
|
+
/**
|
728
|
+
* An opaque object that's passed in as the first argument to all callbacks
|
729
|
+
* used by this library. Default: NULL.
|
730
|
+
*/
|
731
|
+
void* userdata;
|
732
|
+
|
733
|
+
/**
|
734
|
+
* The tab-stop size, for computing positions in source code that uses tabs.
|
735
|
+
* Default: 8.
|
736
|
+
*/
|
737
|
+
int tab_stop;
|
738
|
+
|
739
|
+
/**
|
740
|
+
* Whether or not to stop parsing when the first error is encountered.
|
741
|
+
* Default: false.
|
742
|
+
*/
|
743
|
+
bool stop_on_first_error;
|
744
|
+
|
745
|
+
/**
|
746
|
+
* The maximum number of errors before the parser stops recording them. This
|
747
|
+
* is provided so that if the page is totally borked, we don't completely fill
|
748
|
+
* up the errors vector and exhaust memory with useless redundant errors. Set
|
749
|
+
* to -1 to disable the limit.
|
750
|
+
* Default: -1
|
751
|
+
*/
|
752
|
+
int max_errors;
|
753
|
+
} GumboOptions;
|
754
|
+
|
755
|
+
/** Default options struct; use this with gumbo_parse_with_options. */
|
756
|
+
extern const GumboOptions kGumboDefaultOptions;
|
757
|
+
|
758
|
+
/** The output struct containing the results of the parse. */
|
759
|
+
typedef struct GumboInternalOutput {
|
760
|
+
/**
|
761
|
+
* Pointer to the document node. This is a GumboNode of type GUMBO_NODE_DOCUMENT
|
762
|
+
* that contains the entire document as its child.
|
763
|
+
*/
|
764
|
+
GumboNode* document;
|
765
|
+
|
766
|
+
/**
|
767
|
+
* Pointer to the root node. This is the <html> tag that forms the root of the
|
768
|
+
* document.
|
769
|
+
*/
|
770
|
+
GumboNode* root;
|
771
|
+
|
772
|
+
/**
|
773
|
+
* A list of errors that occurred during the parse.
|
774
|
+
* NOTE: In version 1.0 of this library, the API for errors hasn't been fully
|
775
|
+
* fleshed out and may change in the future. For this reason, the GumboError
|
776
|
+
* header isn't part of the public API. Contact us if you need errors
|
777
|
+
* reported so we can work out something appropriate for your use-case.
|
778
|
+
*/
|
779
|
+
GumboVector /* GumboError */ errors;
|
780
|
+
} GumboOutput;
|
781
|
+
|
782
|
+
/**
|
783
|
+
* Parses a buffer of UTF8 text into a GumboNode parse tree. The buffer must
|
784
|
+
* live at least as long as the parse tree, as some fields (eg. original_text)
|
785
|
+
* point directly into the original buffer.
|
786
|
+
*
|
787
|
+
* This doesn't support buffers longer than 4 gigabytes.
|
788
|
+
*/
|
789
|
+
GumboOutput* gumbo_parse(const char* buffer);
|
790
|
+
|
791
|
+
/**
|
792
|
+
* Extended version of gumbo_parse that takes an explicit options structure,
|
793
|
+
* buffer, and length.
|
794
|
+
*/
|
795
|
+
GumboOutput* gumbo_parse_with_options(
|
796
|
+
const GumboOptions* options, const char* buffer, size_t buffer_length);
|
797
|
+
|
798
|
+
/** Release the memory used for the parse tree & parse errors. */
|
799
|
+
void gumbo_destroy_output(
|
800
|
+
const GumboOptions* options, GumboOutput* output);
|
801
|
+
|
802
|
+
|
803
|
+
#ifdef __cplusplus
|
804
|
+
}
|
805
|
+
#endif
|
806
|
+
|
807
|
+
#endif // GUMBO_GUMBO_H_
|