nokogumbo 0.9 → 0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/README.md +15 -10
- data/ext/nokogumboc/extconf.rb +34 -0
- data/ext/nokogumboc/nokogumbo.c +192 -0
- data/gumbo-parser/src/attribute.c +3 -3
- data/gumbo-parser/src/attribute.h +2 -2
- data/gumbo-parser/src/char_ref.c +8 -8
- data/gumbo-parser/src/char_ref.h +3 -3
- data/gumbo-parser/src/error.h +13 -13
- data/gumbo-parser/src/gumbo.h +21 -22
- data/gumbo-parser/src/insertion_mode.h +1 -1
- data/gumbo-parser/src/parser.c +1 -1
- data/gumbo-parser/src/parser.h +9 -9
- data/gumbo-parser/src/string_buffer.c +8 -8
- data/gumbo-parser/src/string_buffer.h +10 -11
- data/gumbo-parser/src/string_piece.c +2 -2
- data/gumbo-parser/src/string_piece.h +2 -2
- data/gumbo-parser/src/token_type.h +1 -1
- data/gumbo-parser/src/tokenizer.c +2 -2
- data/gumbo-parser/src/tokenizer.h +12 -12
- data/gumbo-parser/src/tokenizer_states.h +1 -1
- data/gumbo-parser/src/utf8.h +7 -7
- data/gumbo-parser/src/util.h +5 -4
- data/gumbo-parser/src/vector.c +11 -9
- data/gumbo-parser/src/vector.h +11 -8
- metadata +27 -31
- data/work/extconf.rb +0 -21
- data/work/nokogumbo.c +0 -100
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MDQzNzc1N2Q4MDg5Y2Q3OTgyMTg1MGExOGViNmIxOGIzODcyYzgwOQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YTYyNmMzZTEyNjFjODYyMjY3MTRmNmU1YWIwZjMyZTZiN2ZkMGU4Yg==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
NjMzYjg0YmU2NzZkZjUyN2YxZDZjOTM3ZWM5MDkxMjk0OWE5YjA3M2NjY2Y3
|
10
|
+
NDgwZTgxN2U0NWIyNzBkMmNlN2E4NjA1ODhmNGNiNTBhYzE3YWJkOTRiNjg3
|
11
|
+
YmMyZDVkYzIxZjljM2FjNDBhMmFiZTZmOWI1ODgzNmQwNzM4ZGY=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
NjgzNTljOWNmMjY1MDBkZDc4ZmYzZGZlZDg2MDMxNzY0YTM2OWI3Y2Q1NDdl
|
14
|
+
M2FhNWE1M2U5YTQ0MjdlNTc0ZWM1OTg3NTkxMzdlNGU3MDdkNzBhZmNhZTUy
|
15
|
+
OTkzYWE0MjBiZTYyNmEyMGJiNDc2MTE4YWFiYWZkM2YwZDRlNTk=
|
data/README.md
CHANGED
@@ -34,13 +34,16 @@ Notes
|
|
34
34
|
|
35
35
|
* The `Nokogiri::HTML5.parse` function takes a string and passes it to the
|
36
36
|
<code>gumbo_parse_with_options</code> method, using the default options.
|
37
|
-
The resulting Gumbo parse tree is the walked
|
38
|
-
[libxml2](http://xmlsoft.org/html/)
|
39
|
-
|
40
|
-
|
41
|
-
object is constructed to wrap the xmlDoc
|
42
|
-
Ruby objects as necessary, so all
|
43
|
-
libxml2 libraries.
|
37
|
+
The resulting Gumbo parse tree is then walked.
|
38
|
+
* If the necessary Nokogiri and [libxml2](http://xmlsoft.org/html/) headers
|
39
|
+
can be found at installation time then an
|
40
|
+
[xmlDoc](http://xmlsoft.org/html/libxml-tree.html#xmlDoc) tree is produced
|
41
|
+
and a single Nokogiri Ruby object is constructed to wrap the xmlDoc
|
42
|
+
structure. Nokogiri only produces Ruby objects as necessary, so all
|
43
|
+
searching is done using the underlying libxml2 libraries.
|
44
|
+
* If the necessary headers are not present at installation time, then
|
45
|
+
Nokogiri Ruby objects are created for each Gumbo node. Other than
|
46
|
+
memory usage and CPU time, the results should be equivalent.
|
44
47
|
|
45
48
|
* The `Nokogiri::HTML5.get` function takes care of following redirects,
|
46
49
|
https, and determining the character encoding of the result, based on the
|
@@ -57,9 +60,11 @@ parser will be downloaded and compiled into the Gem itself.
|
|
57
60
|
Installation
|
58
61
|
============
|
59
62
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
+
git clone --recursive https://github.com/rubys/nokogumbo.git
|
64
|
+
cd nokogumbo
|
65
|
+
bundle install
|
66
|
+
rake gem
|
67
|
+
gem install pkg/nokogumbo*.gem
|
63
68
|
|
64
69
|
Related efforts
|
65
70
|
============
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'mkmf'
|
2
|
+
$CFLAGS += " -std=c99"
|
3
|
+
|
4
|
+
if have_library('xml2', 'xmlNewDoc')
|
5
|
+
# libxml2 libraries from http://www.xmlsoft.org/
|
6
|
+
pkg_config('libxml-2.0')
|
7
|
+
|
8
|
+
# nokogiri configuration from gem install
|
9
|
+
nokogiri_lib = Gem.find_files('nokogiri').
|
10
|
+
sort_by {|name| name[/nokogiri-([\d.]+)/,1].split('.').map(&:to_i)}.last
|
11
|
+
if nokogiri_lib
|
12
|
+
nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(.rb)?$), 'ext/nokogiri')
|
13
|
+
|
14
|
+
# if that doesn't work, try workarounds found in Nokogiri's extconf
|
15
|
+
unless find_header('nokogiri.h', nokogiri_ext)
|
16
|
+
require "#{nokogiri_ext}/extconf.rb"
|
17
|
+
end
|
18
|
+
|
19
|
+
# if found, enable direct calls to Nokogiri (and libxml2)
|
20
|
+
$CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# add in gumbo-parser source from github if not already installed
|
25
|
+
unless have_library('gumbo', 'gumbo_parse')
|
26
|
+
rakehome = ENV['RAKEHOME'] || File.expand_path('../..')
|
27
|
+
unless File.exist? "#{rakehome}/ext/nokogumboc/gumbo.h"
|
28
|
+
require 'fileutils'
|
29
|
+
FileUtils.cp Dir["#{rakehome}/gumbo-parser/src/*"],
|
30
|
+
"#{rakehome}/ext/nokogumboc"
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
create_makefile('nokogumboc')
|
@@ -0,0 +1,192 @@
|
|
1
|
+
//
|
2
|
+
// nokogumbo.c defines the following:
|
3
|
+
//
|
4
|
+
// class Nokogumbo
|
5
|
+
// def parse(utf8_string) # returns Nokogiri::HTML::Document
|
6
|
+
// end
|
7
|
+
//
|
8
|
+
// Processing starts by calling gumbo_parse_with_options. The resulting
|
9
|
+
// document tree is then walked:
|
10
|
+
//
|
11
|
+
// * if Nokogiri and libxml2 headers are available at compile time,
|
12
|
+
// (ifdef NGLIB) then a parallel libxml2 tree is constructed, and the
|
13
|
+
// final document is then wrapped using Nokogiri_wrap_xml_document.
|
14
|
+
// This approach reduces memory and CPU requirements as Ruby objects
|
15
|
+
// are only built when necessary.
|
16
|
+
//
|
17
|
+
// * if the necessary headers are not available at compile time, Nokogiri
|
18
|
+
// methods are called instead, producing the equivalent functionality.
|
19
|
+
//
|
20
|
+
|
21
|
+
#include <ruby.h>
|
22
|
+
#include <gumbo.h>
|
23
|
+
|
24
|
+
// class constants
|
25
|
+
static VALUE Document;
|
26
|
+
|
27
|
+
#ifdef NGLIB
|
28
|
+
#include <nokogiri.h>
|
29
|
+
#include <libxml/tree.h>
|
30
|
+
|
31
|
+
#define NIL NULL
|
32
|
+
#define CONST_CAST (xmlChar const*)
|
33
|
+
#else
|
34
|
+
#define NIL 0
|
35
|
+
#define CONST_CAST
|
36
|
+
|
37
|
+
// more class constants
|
38
|
+
static VALUE Element;
|
39
|
+
static VALUE Text;
|
40
|
+
static VALUE CDATA;
|
41
|
+
static VALUE Comment;
|
42
|
+
|
43
|
+
// interned symbols
|
44
|
+
static VALUE new;
|
45
|
+
static VALUE set_attribute;
|
46
|
+
static VALUE add_child;
|
47
|
+
static VALUE internal_subset;
|
48
|
+
static VALUE remove_;
|
49
|
+
static VALUE create_internal_subset;
|
50
|
+
|
51
|
+
// map libxml2 types to Ruby VALUE
|
52
|
+
#define xmlNodePtr VALUE
|
53
|
+
#define xmlDocPtr VALUE
|
54
|
+
|
55
|
+
// redefine libxml2 API as Ruby function calls
|
56
|
+
#define xmlNewDocNode(doc, ns, name, content) \
|
57
|
+
rb_funcall(Element, new, 2, rb_str_new2(name), doc)
|
58
|
+
#define xmlNewProp(element, name, value) \
|
59
|
+
rb_funcall(element, set_attribute, 2, rb_str_new2(name), rb_str_new2(value))
|
60
|
+
#define xmlNewDocText(doc, text) \
|
61
|
+
rb_funcall(Text, new, 2, rb_str_new2(text), doc)
|
62
|
+
#define xmlNewCDataBlock(doc, content, length) \
|
63
|
+
rb_funcall(CDATA, new, 2, rb_str_new(content, length), doc)
|
64
|
+
#define xmlNewDocComment(doc, text) \
|
65
|
+
rb_funcall(Comment, new, 2, doc, rb_str_new2(text))
|
66
|
+
#define xmlAddChild(element, node) \
|
67
|
+
rb_funcall(element, add_child, 1, node)
|
68
|
+
#define xmlDocSetRootElement(doc, root) \
|
69
|
+
rb_funcall(doc, add_child, 1, root)
|
70
|
+
#define xmlCreateIntSubset(doc, name, external, system) \
|
71
|
+
rb_funcall(doc, create_internal_subset, 3, rb_str_new2(name), \
|
72
|
+
(external ? rb_str_new2(external) : Qnil), \
|
73
|
+
(system ? rb_str_new2(system) : Qnil));
|
74
|
+
#define Nokogiri_wrap_xml_document(klass, doc) \
|
75
|
+
doc
|
76
|
+
|
77
|
+
// remove internal subset from newly created documents
|
78
|
+
static VALUE xmlNewDoc(char* version) {
|
79
|
+
VALUE doc = rb_funcall(Document, new, 0);
|
80
|
+
rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0);
|
81
|
+
return doc;
|
82
|
+
}
|
83
|
+
#endif
|
84
|
+
|
85
|
+
// Build a Nokogiri Element for a given GumboElement (recursively)
|
86
|
+
static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
|
87
|
+
// determine tag name for a given node
|
88
|
+
xmlNodePtr element;
|
89
|
+
if (node->tag != GUMBO_TAG_UNKNOWN) {
|
90
|
+
element = xmlNewDocNode(document, NIL,
|
91
|
+
CONST_CAST gumbo_normalized_tagname(node->tag), NIL);
|
92
|
+
} else {
|
93
|
+
GumboStringPiece tag = node->original_tag;
|
94
|
+
gumbo_tag_from_original_text(&tag);
|
95
|
+
char name[tag.length+1];
|
96
|
+
strncpy(name, tag.data, tag.length);
|
97
|
+
name[tag.length] = '\0';
|
98
|
+
element = xmlNewDocNode(document, NIL, CONST_CAST name, NIL);
|
99
|
+
}
|
100
|
+
|
101
|
+
// add in the attributes
|
102
|
+
GumboVector* attrs = &node->attributes;
|
103
|
+
for (int i=0; i < attrs->length; i++) {
|
104
|
+
GumboAttribute *attr = attrs->data[i];
|
105
|
+
xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
|
106
|
+
}
|
107
|
+
|
108
|
+
// add in the children
|
109
|
+
GumboVector* children = &node->children;
|
110
|
+
for (int i=0; i < children->length; i++) {
|
111
|
+
GumboNode* child = children->data[i];
|
112
|
+
|
113
|
+
xmlNodePtr node = NIL;
|
114
|
+
|
115
|
+
switch (child->type) {
|
116
|
+
case GUMBO_NODE_ELEMENT:
|
117
|
+
node = walk_tree(document, &child->v.element);
|
118
|
+
break;
|
119
|
+
case GUMBO_NODE_WHITESPACE:
|
120
|
+
case GUMBO_NODE_TEXT:
|
121
|
+
node = xmlNewDocText(document, CONST_CAST child->v.text.text);
|
122
|
+
break;
|
123
|
+
case GUMBO_NODE_CDATA:
|
124
|
+
node = xmlNewCDataBlock(document,
|
125
|
+
CONST_CAST child->v.text.original_text.data,
|
126
|
+
(int) child->v.text.original_text.length);
|
127
|
+
break;
|
128
|
+
case GUMBO_NODE_COMMENT:
|
129
|
+
node = xmlNewDocComment(document, CONST_CAST child->v.text.text);
|
130
|
+
break;
|
131
|
+
case GUMBO_NODE_DOCUMENT:
|
132
|
+
break; // should never happen -- ignore
|
133
|
+
}
|
134
|
+
|
135
|
+
if (node) xmlAddChild(element, node);
|
136
|
+
}
|
137
|
+
|
138
|
+
return element;
|
139
|
+
}
|
140
|
+
|
141
|
+
// Parse a string using gumbo_parse into a Nokogiri document
|
142
|
+
static VALUE parse(VALUE self, VALUE string) {
|
143
|
+
GumboOutput *output = gumbo_parse_with_options(
|
144
|
+
&kGumboDefaultOptions, RSTRING_PTR(string),
|
145
|
+
(size_t) RSTRING_LEN(string)
|
146
|
+
);
|
147
|
+
xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
|
148
|
+
xmlNodePtr root = walk_tree(doc, &output->root->v.element);
|
149
|
+
xmlDocSetRootElement(doc, root);
|
150
|
+
if (output->document->v.document.has_doctype) {
|
151
|
+
const char *public = output->document->v.document.public_identifier;
|
152
|
+
const char *system = output->document->v.document.system_identifier;
|
153
|
+
xmlCreateIntSubset(doc, CONST_CAST "html",
|
154
|
+
(strlen(public) ? CONST_CAST public : NIL),
|
155
|
+
(strlen(system) ? CONST_CAST system : NIL));
|
156
|
+
}
|
157
|
+
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
158
|
+
|
159
|
+
return Nokogiri_wrap_xml_document(Document, doc);
|
160
|
+
}
|
161
|
+
|
162
|
+
// Initialize the Nokogumbo class and fetch constants we will use later
|
163
|
+
void Init_nokogumboc() {
|
164
|
+
rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
|
165
|
+
rb_require("nokogiri");
|
166
|
+
|
167
|
+
// class constants
|
168
|
+
VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
169
|
+
VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
|
170
|
+
Document = rb_const_get(HTML, rb_intern("Document"));
|
171
|
+
|
172
|
+
#ifndef NGLIB
|
173
|
+
// more class constants
|
174
|
+
VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
175
|
+
Element = rb_const_get(XML, rb_intern("Element"));
|
176
|
+
Text = rb_const_get(XML, rb_intern("Text"));
|
177
|
+
CDATA = rb_const_get(XML, rb_intern("CDATA"));
|
178
|
+
Comment = rb_const_get(XML, rb_intern("Comment"));
|
179
|
+
|
180
|
+
// interned symbols
|
181
|
+
new = rb_intern("new");
|
182
|
+
set_attribute = rb_intern("set_attribute");
|
183
|
+
add_child = rb_intern("add_child");
|
184
|
+
internal_subset = rb_intern("internal_subset");
|
185
|
+
remove_ = rb_intern("remove");
|
186
|
+
create_internal_subset = rb_intern("create_internal_subset");
|
187
|
+
#endif
|
188
|
+
|
189
|
+
// define Nokogumbo class with a singleton parse method
|
190
|
+
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
|
191
|
+
rb_define_singleton_method(Gumbo, "parse", parse, 1);
|
192
|
+
}
|
@@ -23,10 +23,10 @@
|
|
23
23
|
|
24
24
|
#include "util.h"
|
25
25
|
|
26
|
-
struct
|
26
|
+
struct GumboInternalParser;
|
27
27
|
|
28
28
|
GumboAttribute* gumbo_get_attribute(
|
29
|
-
const
|
29
|
+
const GumboVector* attributes, const char* name) {
|
30
30
|
for (int i = 0; i < attributes->length; ++i) {
|
31
31
|
GumboAttribute* attr = attributes->data[i];
|
32
32
|
if (!strcasecmp(attr->name, name)) {
|
@@ -37,7 +37,7 @@ GumboAttribute* gumbo_get_attribute(
|
|
37
37
|
}
|
38
38
|
|
39
39
|
void gumbo_destroy_attribute(
|
40
|
-
struct
|
40
|
+
struct GumboInternalParser* parser, GumboAttribute* attribute) {
|
41
41
|
gumbo_parser_deallocate(parser, (void*) attribute->name);
|
42
42
|
gumbo_parser_deallocate(parser, (void*) attribute->value);
|
43
43
|
gumbo_parser_deallocate(parser, (void*) attribute);
|
@@ -23,12 +23,12 @@
|
|
23
23
|
extern "C" {
|
24
24
|
#endif
|
25
25
|
|
26
|
-
struct
|
26
|
+
struct GumboInternalParser;
|
27
27
|
|
28
28
|
// Release the memory used for a GumboAttribute, including the attribute
|
29
29
|
// itself.
|
30
30
|
void gumbo_destroy_attribute(
|
31
|
-
struct
|
31
|
+
struct GumboInternalParser* parser, GumboAttribute* attribute);
|
32
32
|
|
33
33
|
#ifdef __cplusplus
|
34
34
|
}
|
data/gumbo-parser/src/char_ref.c
CHANGED
@@ -26,7 +26,7 @@
|
|
26
26
|
#include "utf8.h"
|
27
27
|
#include "util.h"
|
28
28
|
|
29
|
-
struct
|
29
|
+
struct GumboInternalParser;
|
30
30
|
|
31
31
|
const int kGumboNoChar = -1;
|
32
32
|
|
@@ -2351,7 +2351,7 @@ static int parse_digit(int c, bool allow_hex) {
|
|
2351
2351
|
}
|
2352
2352
|
|
2353
2353
|
static void add_no_digit_error(
|
2354
|
-
struct
|
2354
|
+
struct GumboInternalParser* parser, Utf8Iterator* input) {
|
2355
2355
|
GumboError* error = gumbo_add_error(parser);
|
2356
2356
|
if (!error) {
|
2357
2357
|
return;
|
@@ -2361,7 +2361,7 @@ static void add_no_digit_error(
|
|
2361
2361
|
}
|
2362
2362
|
|
2363
2363
|
static void add_codepoint_error(
|
2364
|
-
struct
|
2364
|
+
struct GumboInternalParser* parser, Utf8Iterator* input,
|
2365
2365
|
GumboErrorType type, int codepoint) {
|
2366
2366
|
GumboError* error = gumbo_add_error(parser);
|
2367
2367
|
if (!error) {
|
@@ -2373,7 +2373,7 @@ static void add_codepoint_error(
|
|
2373
2373
|
}
|
2374
2374
|
|
2375
2375
|
static void add_named_reference_error(
|
2376
|
-
struct
|
2376
|
+
struct GumboInternalParser* parser, Utf8Iterator* input,
|
2377
2377
|
GumboErrorType type, GumboStringPiece text) {
|
2378
2378
|
GumboError* error = gumbo_add_error(parser);
|
2379
2379
|
if (!error) {
|
@@ -2394,7 +2394,7 @@ static int maybe_replace_codepoint(int codepoint) {
|
|
2394
2394
|
}
|
2395
2395
|
|
2396
2396
|
static bool consume_numeric_ref(
|
2397
|
-
struct
|
2397
|
+
struct GumboInternalParser* parser, Utf8Iterator* input, int* output) {
|
2398
2398
|
utf8iterator_next(input);
|
2399
2399
|
bool is_hex = false;
|
2400
2400
|
int c = utf8iterator_current(input);
|
@@ -2475,7 +2475,7 @@ static bool is_legal_attribute_char_next(Utf8Iterator* input) {
|
|
2475
2475
|
}
|
2476
2476
|
|
2477
2477
|
static bool maybe_add_invalid_named_reference(
|
2478
|
-
struct
|
2478
|
+
struct GumboInternalParser* parser, Utf8Iterator* input) {
|
2479
2479
|
// The iterator will always be reset in this code path, so we don't need to
|
2480
2480
|
// worry about consuming characters.
|
2481
2481
|
const char* start = utf8iterator_get_char_pointer(input);
|
@@ -2498,7 +2498,7 @@ static bool maybe_add_invalid_named_reference(
|
|
2498
2498
|
}
|
2499
2499
|
|
2500
2500
|
static bool consume_named_ref(
|
2501
|
-
struct
|
2501
|
+
struct GumboInternalParser* parser, Utf8Iterator* input, bool is_in_attribute,
|
2502
2502
|
OneOrTwoCodepoints* output) {
|
2503
2503
|
assert(output->first == kGumboNoChar);
|
2504
2504
|
const NamedCharRef* char_ref = find_named_char_ref(input);
|
@@ -2530,7 +2530,7 @@ static bool consume_named_ref(
|
|
2530
2530
|
}
|
2531
2531
|
|
2532
2532
|
bool consume_char_ref(
|
2533
|
-
struct
|
2533
|
+
struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
|
2534
2534
|
int additional_allowed_char, bool is_in_attribute,
|
2535
2535
|
OneOrTwoCodepoints* output) {
|
2536
2536
|
utf8iterator_mark(input);
|
data/gumbo-parser/src/char_ref.h
CHANGED
@@ -27,8 +27,8 @@
|
|
27
27
|
extern "C" {
|
28
28
|
#endif
|
29
29
|
|
30
|
-
struct
|
31
|
-
struct
|
30
|
+
struct GumboInternalParser;
|
31
|
+
struct GumboInternalUtf8Iterator;
|
32
32
|
|
33
33
|
// Value that indicates no character was produced.
|
34
34
|
extern const int kGumboNoChar;
|
@@ -50,7 +50,7 @@ typedef struct {
|
|
50
50
|
// space for the "additional allowed char" when the spec says "with no
|
51
51
|
// additional allowed char". Returns false on parse error, true otherwise.
|
52
52
|
bool consume_char_ref(
|
53
|
-
struct
|
53
|
+
struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
|
54
54
|
int additional_allowed_char, bool is_in_attribute,
|
55
55
|
OneOrTwoCodepoints* output);
|
56
56
|
|
data/gumbo-parser/src/error.h
CHANGED
@@ -30,7 +30,7 @@
|
|
30
30
|
extern "C" {
|
31
31
|
#endif
|
32
32
|
|
33
|
-
struct
|
33
|
+
struct GumboInternalParser;
|
34
34
|
|
35
35
|
typedef enum {
|
36
36
|
GUMBO_ERR_UTF8_INVALID,
|
@@ -78,7 +78,7 @@ typedef enum {
|
|
78
78
|
} GumboErrorType;
|
79
79
|
|
80
80
|
// Additional data for duplicated attributes.
|
81
|
-
typedef struct
|
81
|
+
typedef struct GumboInternalDuplicateAttrError {
|
82
82
|
// The name of the attribute. Owned by this struct.
|
83
83
|
const char* name;
|
84
84
|
|
@@ -114,7 +114,7 @@ typedef enum {
|
|
114
114
|
// Additional data for tokenizer errors.
|
115
115
|
// This records the current state and codepoint encountered - this is usually
|
116
116
|
// enough to reconstruct what went wrong and provide a friendly error message.
|
117
|
-
typedef struct
|
117
|
+
typedef struct GumboInternalTokenizerError {
|
118
118
|
// The bad codepoint encountered.
|
119
119
|
int codepoint;
|
120
120
|
|
@@ -123,7 +123,7 @@ typedef struct _GumboTokenizerError {
|
|
123
123
|
} GumboTokenizerError;
|
124
124
|
|
125
125
|
// Additional data for parse errors.
|
126
|
-
typedef struct
|
126
|
+
typedef struct GumboInternalParserError {
|
127
127
|
// The type of input token that resulted in this error.
|
128
128
|
GumboTokenType input_type;
|
129
129
|
|
@@ -142,7 +142,7 @@ typedef struct _GumboParserError {
|
|
142
142
|
// The overall error struct representing an error in decoding/tokenizing/parsing
|
143
143
|
// the HTML. This contains an enumerated type flag, a source position, and then
|
144
144
|
// a union of fields containing data specific to the error.
|
145
|
-
typedef struct
|
145
|
+
typedef struct GumboInternalError {
|
146
146
|
// The type of error.
|
147
147
|
GumboErrorType type;
|
148
148
|
|
@@ -176,23 +176,23 @@ typedef struct _GumboError {
|
|
176
176
|
|
177
177
|
// Parser state, for GUMBO_ERR_PARSER and
|
178
178
|
// GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
|
179
|
-
struct
|
179
|
+
struct GumboInternalParserError parser;
|
180
180
|
} v;
|
181
181
|
} GumboError;
|
182
182
|
|
183
183
|
// Adds a new error to the parser's error list, and returns a pointer to it so
|
184
184
|
// that clients can fill out the rest of its fields. May return NULL if we're
|
185
185
|
// already over the max_errors field specified in GumboOptions.
|
186
|
-
GumboError* gumbo_add_error(struct
|
186
|
+
GumboError* gumbo_add_error(struct GumboInternalParser* parser);
|
187
187
|
|
188
188
|
// Initializes the errors vector in the parser.
|
189
|
-
void gumbo_init_errors(struct
|
189
|
+
void gumbo_init_errors(struct GumboInternalParser* errors);
|
190
190
|
|
191
191
|
// Frees all the errors in the 'errors_' field of the parser.
|
192
|
-
void gumbo_destroy_errors(struct
|
192
|
+
void gumbo_destroy_errors(struct GumboInternalParser* errors);
|
193
193
|
|
194
194
|
// Frees the memory used for a single GumboError.
|
195
|
-
void gumbo_error_destroy(struct
|
195
|
+
void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
|
196
196
|
|
197
197
|
// Prints an error to a string. This fills an empty GumboStringBuffer with a
|
198
198
|
// freshly-allocated buffer containing the error message text. The caller is
|
@@ -200,7 +200,7 @@ void gumbo_error_destroy(struct _GumboParser* parser, GumboError* error);
|
|
200
200
|
// the allocator specified in the GumboParser config and hence should be freed
|
201
201
|
// by gumbo_parser_deallocate().)
|
202
202
|
void gumbo_error_to_string(
|
203
|
-
struct
|
203
|
+
struct GumboInternalParser* parser, const GumboError* error,
|
204
204
|
GumboStringBuffer* output);
|
205
205
|
|
206
206
|
// Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
|
@@ -209,13 +209,13 @@ void gumbo_error_to_string(
|
|
209
209
|
// allocated with the allocator specified in the GumboParser config and hence
|
210
210
|
// should be freed by gumbo_parser_deallocate().)
|
211
211
|
void gumbo_caret_diagnostic_to_string(
|
212
|
-
struct
|
212
|
+
struct GumboInternalParser* parser, const GumboError* error,
|
213
213
|
const char* source_text, GumboStringBuffer* output);
|
214
214
|
|
215
215
|
// Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
|
216
216
|
// of writing to a string.
|
217
217
|
void gumbo_print_caret_diagnostic(
|
218
|
-
struct
|
218
|
+
struct GumboInternalParser* parser, const GumboError* error,
|
219
219
|
const char* source_text);
|
220
220
|
|
221
221
|
#ifdef __cplusplus
|
data/gumbo-parser/src/gumbo.h
CHANGED
@@ -59,7 +59,7 @@ extern "C" {
|
|
59
59
|
* buffer of bytes), while the column field is often used to reference a
|
60
60
|
* particular column on a printable display, which nowadays is usually UTF-8.
|
61
61
|
*/
|
62
|
-
typedef struct
|
62
|
+
typedef struct {
|
63
63
|
unsigned int line;
|
64
64
|
unsigned int column;
|
65
65
|
unsigned int offset;
|
@@ -81,7 +81,7 @@ extern const GumboSourcePosition kGumboEmptySourcePosition;
|
|
81
81
|
* Clients should assume that it is not NUL-terminated, and should always use
|
82
82
|
* explicit lengths when manipulating them.
|
83
83
|
*/
|
84
|
-
typedef struct
|
84
|
+
typedef struct {
|
85
85
|
/** A pointer to the beginning of the string. NULL iff length == 0. */
|
86
86
|
const char* data;
|
87
87
|
|
@@ -116,7 +116,7 @@ bool gumbo_string_equals_ignore_case(
|
|
116
116
|
* library. Iteration can be done through inspecting the structure directly in
|
117
117
|
* a for-loop.
|
118
118
|
*/
|
119
|
-
typedef struct
|
119
|
+
typedef struct {
|
120
120
|
/** Data elements. This points to a dynamically-allocated array of capacity
|
121
121
|
* elements, each a void* to the element itself.
|
122
122
|
*/
|
@@ -151,7 +151,7 @@ int gumbo_vector_index_of(GumboVector* vector, void* element);
|
|
151
151
|
* efficiency benefits, by letting the parser work with enums instead of
|
152
152
|
* strings.
|
153
153
|
*/
|
154
|
-
typedef enum
|
154
|
+
typedef enum {
|
155
155
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
|
156
156
|
GUMBO_TAG_HTML,
|
157
157
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
|
@@ -365,7 +365,7 @@ GumboTag gumbo_tag_enum(const char* tagname);
|
|
365
365
|
* HTML includes special handling for XLink, XML, and XMLNS namespaces on
|
366
366
|
* attributes. Everything else goes in the generic "NONE" namespace.
|
367
367
|
*/
|
368
|
-
typedef enum
|
368
|
+
typedef enum {
|
369
369
|
GUMBO_ATTR_NAMESPACE_NONE,
|
370
370
|
GUMBO_ATTR_NAMESPACE_XLINK,
|
371
371
|
GUMBO_ATTR_NAMESPACE_XML,
|
@@ -377,7 +377,7 @@ typedef enum _GumboAttributeNamespaceEnum {
|
|
377
377
|
* name-value pair, but also includes information about source locations and
|
378
378
|
* original source text.
|
379
379
|
*/
|
380
|
-
typedef struct
|
380
|
+
typedef struct {
|
381
381
|
/**
|
382
382
|
* The namespace for the attribute. This will usually be
|
383
383
|
* GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
|
@@ -438,14 +438,13 @@ typedef struct _GumboAttribute {
|
|
438
438
|
* and return it, or NULL if no such attribute exists. This uses a
|
439
439
|
* case-insensitive match, as HTML is case-insensitive.
|
440
440
|
*/
|
441
|
-
GumboAttribute* gumbo_get_attribute(
|
442
|
-
const struct _GumboVector* attrs, const char* name);
|
441
|
+
GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
|
443
442
|
|
444
443
|
/**
|
445
444
|
* Enum denoting the type of node. This determines the type of the node.v
|
446
445
|
* union.
|
447
446
|
*/
|
448
|
-
typedef enum
|
447
|
+
typedef enum {
|
449
448
|
/** Document node. v will be a GumboDocument. */
|
450
449
|
GUMBO_NODE_DOCUMENT,
|
451
450
|
/** Element node. v will be a GumboElement. */
|
@@ -464,10 +463,10 @@ typedef enum _GumboNodeType {
|
|
464
463
|
* Forward declaration of GumboNode so it can be used recursively in
|
465
464
|
* GumboNode.parent.
|
466
465
|
*/
|
467
|
-
typedef struct
|
466
|
+
typedef struct GumboInternalNode GumboNode;
|
468
467
|
|
469
468
|
/** http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode */
|
470
|
-
typedef enum
|
469
|
+
typedef enum {
|
471
470
|
GUMBO_DOCTYPE_NO_QUIRKS,
|
472
471
|
GUMBO_DOCTYPE_QUIRKS,
|
473
472
|
GUMBO_DOCTYPE_LIMITED_QUIRKS
|
@@ -480,7 +479,7 @@ typedef enum _GumboQuirksModeEnum {
|
|
480
479
|
* <math> tag is in the MathML namespace, and anything else is inside the HTML
|
481
480
|
* namespace. No other namespaces are supported, so this can be an enum only.
|
482
481
|
*/
|
483
|
-
typedef enum
|
482
|
+
typedef enum {
|
484
483
|
GUMBO_NAMESPACE_HTML,
|
485
484
|
GUMBO_NAMESPACE_SVG,
|
486
485
|
GUMBO_NAMESPACE_MATHML
|
@@ -494,7 +493,7 @@ typedef enum _GumboNamespaceEnum {
|
|
494
493
|
* may not be allowed by a style guide, or track the prevalence of incorrect or
|
495
494
|
* tricky HTML code.
|
496
495
|
*/
|
497
|
-
typedef enum
|
496
|
+
typedef enum {
|
498
497
|
/**
|
499
498
|
* A normal node - both start and end tags appear in the source, nothing has
|
500
499
|
* been reparented.
|
@@ -568,7 +567,7 @@ typedef enum _GumboParseFlags {
|
|
568
567
|
/**
|
569
568
|
* Information specific to document nodes.
|
570
569
|
*/
|
571
|
-
typedef struct
|
570
|
+
typedef struct {
|
572
571
|
/**
|
573
572
|
* An array of GumboNodes, containing the children of this element. This will
|
574
573
|
* normally consist of the <html> element and any comment nodes found.
|
@@ -595,7 +594,7 @@ typedef struct _GumboDocument {
|
|
595
594
|
* The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
|
596
595
|
* This contains just a block of text and its position.
|
597
596
|
*/
|
598
|
-
typedef struct
|
597
|
+
typedef struct {
|
599
598
|
/**
|
600
599
|
* The text of this node, after entities have been parsed and decoded. For
|
601
600
|
* comment/cdata nodes, this does not include the comment delimiters.
|
@@ -619,7 +618,7 @@ typedef struct _GumboText {
|
|
619
618
|
* The struct used to represent all HTML elements. This contains information
|
620
619
|
* about the tag, attributes, and child nodes.
|
621
620
|
*/
|
622
|
-
typedef struct
|
621
|
+
typedef struct {
|
623
622
|
/**
|
624
623
|
* An array of GumboNodes, containing the children of this element. Pointers
|
625
624
|
* are owned.
|
@@ -664,7 +663,7 @@ typedef struct _GumboElement {
|
|
664
663
|
* A supertype for GumboElement and GumboText, so that we can include one
|
665
664
|
* generic type in lists of children and cast as necessary to subtypes.
|
666
665
|
*/
|
667
|
-
struct
|
666
|
+
struct GumboInternalNode {
|
668
667
|
/** The type of node that this is. */
|
669
668
|
GumboNodeType type;
|
670
669
|
|
@@ -710,7 +709,7 @@ typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
|
|
710
709
|
* handling, etc.
|
711
710
|
* Use kGumboDefaultOptions for sensible defaults, and only set what you need.
|
712
711
|
*/
|
713
|
-
typedef struct
|
712
|
+
typedef struct GumboInternalOptions {
|
714
713
|
/** A memory allocator function. Default: malloc. */
|
715
714
|
GumboAllocatorFunction allocator;
|
716
715
|
|
@@ -749,7 +748,7 @@ typedef struct _GumboOptions {
|
|
749
748
|
extern const GumboOptions kGumboDefaultOptions;
|
750
749
|
|
751
750
|
/** The output struct containing the results of the parse. */
|
752
|
-
typedef struct
|
751
|
+
typedef struct GumboInternalOutput {
|
753
752
|
/**
|
754
753
|
* Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT
|
755
754
|
* that contains the entire document as its child.
|
@@ -779,18 +778,18 @@ typedef struct _GumboOutput {
|
|
779
778
|
*
|
780
779
|
* This doesn't support buffers longer than 4 gigabytes.
|
781
780
|
*/
|
782
|
-
|
781
|
+
GumboOutput* gumbo_parse(const char* buffer);
|
783
782
|
|
784
783
|
/**
|
785
784
|
* Extended version of gumbo_parse that takes an explicit options structure,
|
786
785
|
* buffer, and length.
|
787
786
|
*/
|
788
|
-
|
787
|
+
GumboOutput* gumbo_parse_with_options(
|
789
788
|
const GumboOptions* options, const char* buffer, size_t buffer_length);
|
790
789
|
|
791
790
|
/** Release the memory used for the parse tree & parse errors. */
|
792
791
|
void gumbo_destroy_output(
|
793
|
-
const
|
792
|
+
const GumboOptions* options, GumboOutput* output);
|
794
793
|
|
795
794
|
|
796
795
|
#ifdef __cplusplus
|