nokogumbo 0.9 → 0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/README.md +15 -10
- data/ext/nokogumboc/extconf.rb +34 -0
- data/ext/nokogumboc/nokogumbo.c +192 -0
- data/gumbo-parser/src/attribute.c +3 -3
- data/gumbo-parser/src/attribute.h +2 -2
- data/gumbo-parser/src/char_ref.c +8 -8
- data/gumbo-parser/src/char_ref.h +3 -3
- data/gumbo-parser/src/error.h +13 -13
- data/gumbo-parser/src/gumbo.h +21 -22
- data/gumbo-parser/src/insertion_mode.h +1 -1
- data/gumbo-parser/src/parser.c +1 -1
- data/gumbo-parser/src/parser.h +9 -9
- data/gumbo-parser/src/string_buffer.c +8 -8
- data/gumbo-parser/src/string_buffer.h +10 -11
- data/gumbo-parser/src/string_piece.c +2 -2
- data/gumbo-parser/src/string_piece.h +2 -2
- data/gumbo-parser/src/token_type.h +1 -1
- data/gumbo-parser/src/tokenizer.c +2 -2
- data/gumbo-parser/src/tokenizer.h +12 -12
- data/gumbo-parser/src/tokenizer_states.h +1 -1
- data/gumbo-parser/src/utf8.h +7 -7
- data/gumbo-parser/src/util.h +5 -4
- data/gumbo-parser/src/vector.c +11 -9
- data/gumbo-parser/src/vector.h +11 -8
- metadata +27 -31
- data/work/extconf.rb +0 -21
- data/work/nokogumbo.c +0 -100
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MDQzNzc1N2Q4MDg5Y2Q3OTgyMTg1MGExOGViNmIxOGIzODcyYzgwOQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
YTYyNmMzZTEyNjFjODYyMjY3MTRmNmU1YWIwZjMyZTZiN2ZkMGU4Yg==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
NjMzYjg0YmU2NzZkZjUyN2YxZDZjOTM3ZWM5MDkxMjk0OWE5YjA3M2NjY2Y3
|
10
|
+
NDgwZTgxN2U0NWIyNzBkMmNlN2E4NjA1ODhmNGNiNTBhYzE3YWJkOTRiNjg3
|
11
|
+
YmMyZDVkYzIxZjljM2FjNDBhMmFiZTZmOWI1ODgzNmQwNzM4ZGY=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
NjgzNTljOWNmMjY1MDBkZDc4ZmYzZGZlZDg2MDMxNzY0YTM2OWI3Y2Q1NDdl
|
14
|
+
M2FhNWE1M2U5YTQ0MjdlNTc0ZWM1OTg3NTkxMzdlNGU3MDdkNzBhZmNhZTUy
|
15
|
+
OTkzYWE0MjBiZTYyNmEyMGJiNDc2MTE4YWFiYWZkM2YwZDRlNTk=
|
data/README.md
CHANGED
@@ -34,13 +34,16 @@ Notes
|
|
34
34
|
|
35
35
|
* The `Nokogiri::HTML5.parse` function takes a string and passes it to the
|
36
36
|
<code>gumbo_parse_with_options</code> method, using the default options.
|
37
|
-
The resulting Gumbo parse tree is the walked
|
38
|
-
[libxml2](http://xmlsoft.org/html/)
|
39
|
-
|
40
|
-
|
41
|
-
object is constructed to wrap the xmlDoc
|
42
|
-
Ruby objects as necessary, so all
|
43
|
-
libxml2 libraries.
|
37
|
+
The resulting Gumbo parse tree is then walked.
|
38
|
+
* If the necessary Nokogiri and [libxml2](http://xmlsoft.org/html/) headers
|
39
|
+
can be found at installation time then an
|
40
|
+
[xmlDoc](http://xmlsoft.org/html/libxml-tree.html#xmlDoc) tree is produced
|
41
|
+
and a single Nokogiri Ruby object is constructed to wrap the xmlDoc
|
42
|
+
structure. Nokogiri only produces Ruby objects as necessary, so all
|
43
|
+
searching is done using the underlying libxml2 libraries.
|
44
|
+
* If the necessary headers are not present at installation time, then
|
45
|
+
Nokogiri Ruby objects are created for each Gumbo node. Other than
|
46
|
+
memory usage and CPU time, the results should be equivalent.
|
44
47
|
|
45
48
|
* The `Nokogiri::HTML5.get` function takes care of following redirects,
|
46
49
|
https, and determining the character encoding of the result, based on the
|
@@ -57,9 +60,11 @@ parser will be downloaded and compiled into the Gem itself.
|
|
57
60
|
Installation
|
58
61
|
============
|
59
62
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
+
git clone --recursive https://github.com/rubys/nokogumbo.git
|
64
|
+
cd nokogumbo
|
65
|
+
bundle install
|
66
|
+
rake gem
|
67
|
+
gem install pkg/nokogumbo*.gem
|
63
68
|
|
64
69
|
Related efforts
|
65
70
|
============
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'mkmf'
|
2
|
+
$CFLAGS += " -std=c99"
|
3
|
+
|
4
|
+
if have_library('xml2', 'xmlNewDoc')
|
5
|
+
# libxml2 libraries from http://www.xmlsoft.org/
|
6
|
+
pkg_config('libxml-2.0')
|
7
|
+
|
8
|
+
# nokogiri configuration from gem install
|
9
|
+
nokogiri_lib = Gem.find_files('nokogiri').
|
10
|
+
sort_by {|name| name[/nokogiri-([\d.]+)/,1].split('.').map(&:to_i)}.last
|
11
|
+
if nokogiri_lib
|
12
|
+
nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(.rb)?$), 'ext/nokogiri')
|
13
|
+
|
14
|
+
# if that doesn't work, try workarounds found in Nokogiri's extconf
|
15
|
+
unless find_header('nokogiri.h', nokogiri_ext)
|
16
|
+
require "#{nokogiri_ext}/extconf.rb"
|
17
|
+
end
|
18
|
+
|
19
|
+
# if found, enable direct calls to Nokogiri (and libxml2)
|
20
|
+
$CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# add in gumbo-parser source from github if not already installed
|
25
|
+
unless have_library('gumbo', 'gumbo_parse')
|
26
|
+
rakehome = ENV['RAKEHOME'] || File.expand_path('../..')
|
27
|
+
unless File.exist? "#{rakehome}/ext/nokogumboc/gumbo.h"
|
28
|
+
require 'fileutils'
|
29
|
+
FileUtils.cp Dir["#{rakehome}/gumbo-parser/src/*"],
|
30
|
+
"#{rakehome}/ext/nokogumboc"
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
create_makefile('nokogumboc')
|
@@ -0,0 +1,192 @@
|
|
1
|
+
//
|
2
|
+
// nokogumbo.c defines the following:
|
3
|
+
//
|
4
|
+
// class Nokogumbo
|
5
|
+
// def parse(utf8_string) # returns Nokogiri::HTML::Document
|
6
|
+
// end
|
7
|
+
//
|
8
|
+
// Processing starts by calling gumbo_parse_with_options. The resulting
|
9
|
+
// document tree is then walked:
|
10
|
+
//
|
11
|
+
// * if Nokogiri and libxml2 headers are available at compile time,
|
12
|
+
// (ifdef NGLIB) then a parallel libxml2 tree is constructed, and the
|
13
|
+
// final document is then wrapped using Nokogiri_wrap_xml_document.
|
14
|
+
// This approach reduces memory and CPU requirements as Ruby objects
|
15
|
+
// are only built when necessary.
|
16
|
+
//
|
17
|
+
// * if the necessary headers are not available at compile time, Nokogiri
|
18
|
+
// methods are called instead, producing the equivalent functionality.
|
19
|
+
//
|
20
|
+
|
21
|
+
#include <ruby.h>
|
22
|
+
#include <gumbo.h>
|
23
|
+
|
24
|
+
// class constants
|
25
|
+
static VALUE Document;
|
26
|
+
|
27
|
+
#ifdef NGLIB
|
28
|
+
#include <nokogiri.h>
|
29
|
+
#include <libxml/tree.h>
|
30
|
+
|
31
|
+
#define NIL NULL
|
32
|
+
#define CONST_CAST (xmlChar const*)
|
33
|
+
#else
|
34
|
+
#define NIL 0
|
35
|
+
#define CONST_CAST
|
36
|
+
|
37
|
+
// more class constants
|
38
|
+
static VALUE Element;
|
39
|
+
static VALUE Text;
|
40
|
+
static VALUE CDATA;
|
41
|
+
static VALUE Comment;
|
42
|
+
|
43
|
+
// interned symbols
|
44
|
+
static VALUE new;
|
45
|
+
static VALUE set_attribute;
|
46
|
+
static VALUE add_child;
|
47
|
+
static VALUE internal_subset;
|
48
|
+
static VALUE remove_;
|
49
|
+
static VALUE create_internal_subset;
|
50
|
+
|
51
|
+
// map libxml2 types to Ruby VALUE
|
52
|
+
#define xmlNodePtr VALUE
|
53
|
+
#define xmlDocPtr VALUE
|
54
|
+
|
55
|
+
// redefine libxml2 API as Ruby function calls
|
56
|
+
#define xmlNewDocNode(doc, ns, name, content) \
|
57
|
+
rb_funcall(Element, new, 2, rb_str_new2(name), doc)
|
58
|
+
#define xmlNewProp(element, name, value) \
|
59
|
+
rb_funcall(element, set_attribute, 2, rb_str_new2(name), rb_str_new2(value))
|
60
|
+
#define xmlNewDocText(doc, text) \
|
61
|
+
rb_funcall(Text, new, 2, rb_str_new2(text), doc)
|
62
|
+
#define xmlNewCDataBlock(doc, content, length) \
|
63
|
+
rb_funcall(CDATA, new, 2, rb_str_new(content, length), doc)
|
64
|
+
#define xmlNewDocComment(doc, text) \
|
65
|
+
rb_funcall(Comment, new, 2, doc, rb_str_new2(text))
|
66
|
+
#define xmlAddChild(element, node) \
|
67
|
+
rb_funcall(element, add_child, 1, node)
|
68
|
+
#define xmlDocSetRootElement(doc, root) \
|
69
|
+
rb_funcall(doc, add_child, 1, root)
|
70
|
+
#define xmlCreateIntSubset(doc, name, external, system) \
|
71
|
+
rb_funcall(doc, create_internal_subset, 3, rb_str_new2(name), \
|
72
|
+
(external ? rb_str_new2(external) : Qnil), \
|
73
|
+
(system ? rb_str_new2(system) : Qnil));
|
74
|
+
#define Nokogiri_wrap_xml_document(klass, doc) \
|
75
|
+
doc
|
76
|
+
|
77
|
+
// remove internal subset from newly created documents
|
78
|
+
static VALUE xmlNewDoc(char* version) {
|
79
|
+
VALUE doc = rb_funcall(Document, new, 0);
|
80
|
+
rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0);
|
81
|
+
return doc;
|
82
|
+
}
|
83
|
+
#endif
|
84
|
+
|
85
|
+
// Build a Nokogiri Element for a given GumboElement (recursively)
|
86
|
+
static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
|
87
|
+
// determine tag name for a given node
|
88
|
+
xmlNodePtr element;
|
89
|
+
if (node->tag != GUMBO_TAG_UNKNOWN) {
|
90
|
+
element = xmlNewDocNode(document, NIL,
|
91
|
+
CONST_CAST gumbo_normalized_tagname(node->tag), NIL);
|
92
|
+
} else {
|
93
|
+
GumboStringPiece tag = node->original_tag;
|
94
|
+
gumbo_tag_from_original_text(&tag);
|
95
|
+
char name[tag.length+1];
|
96
|
+
strncpy(name, tag.data, tag.length);
|
97
|
+
name[tag.length] = '\0';
|
98
|
+
element = xmlNewDocNode(document, NIL, CONST_CAST name, NIL);
|
99
|
+
}
|
100
|
+
|
101
|
+
// add in the attributes
|
102
|
+
GumboVector* attrs = &node->attributes;
|
103
|
+
for (int i=0; i < attrs->length; i++) {
|
104
|
+
GumboAttribute *attr = attrs->data[i];
|
105
|
+
xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
|
106
|
+
}
|
107
|
+
|
108
|
+
// add in the children
|
109
|
+
GumboVector* children = &node->children;
|
110
|
+
for (int i=0; i < children->length; i++) {
|
111
|
+
GumboNode* child = children->data[i];
|
112
|
+
|
113
|
+
xmlNodePtr node = NIL;
|
114
|
+
|
115
|
+
switch (child->type) {
|
116
|
+
case GUMBO_NODE_ELEMENT:
|
117
|
+
node = walk_tree(document, &child->v.element);
|
118
|
+
break;
|
119
|
+
case GUMBO_NODE_WHITESPACE:
|
120
|
+
case GUMBO_NODE_TEXT:
|
121
|
+
node = xmlNewDocText(document, CONST_CAST child->v.text.text);
|
122
|
+
break;
|
123
|
+
case GUMBO_NODE_CDATA:
|
124
|
+
node = xmlNewCDataBlock(document,
|
125
|
+
CONST_CAST child->v.text.original_text.data,
|
126
|
+
(int) child->v.text.original_text.length);
|
127
|
+
break;
|
128
|
+
case GUMBO_NODE_COMMENT:
|
129
|
+
node = xmlNewDocComment(document, CONST_CAST child->v.text.text);
|
130
|
+
break;
|
131
|
+
case GUMBO_NODE_DOCUMENT:
|
132
|
+
break; // should never happen -- ignore
|
133
|
+
}
|
134
|
+
|
135
|
+
if (node) xmlAddChild(element, node);
|
136
|
+
}
|
137
|
+
|
138
|
+
return element;
|
139
|
+
}
|
140
|
+
|
141
|
+
// Parse a string using gumbo_parse into a Nokogiri document
|
142
|
+
static VALUE parse(VALUE self, VALUE string) {
|
143
|
+
GumboOutput *output = gumbo_parse_with_options(
|
144
|
+
&kGumboDefaultOptions, RSTRING_PTR(string),
|
145
|
+
(size_t) RSTRING_LEN(string)
|
146
|
+
);
|
147
|
+
xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
|
148
|
+
xmlNodePtr root = walk_tree(doc, &output->root->v.element);
|
149
|
+
xmlDocSetRootElement(doc, root);
|
150
|
+
if (output->document->v.document.has_doctype) {
|
151
|
+
const char *public = output->document->v.document.public_identifier;
|
152
|
+
const char *system = output->document->v.document.system_identifier;
|
153
|
+
xmlCreateIntSubset(doc, CONST_CAST "html",
|
154
|
+
(strlen(public) ? CONST_CAST public : NIL),
|
155
|
+
(strlen(system) ? CONST_CAST system : NIL));
|
156
|
+
}
|
157
|
+
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
158
|
+
|
159
|
+
return Nokogiri_wrap_xml_document(Document, doc);
|
160
|
+
}
|
161
|
+
|
162
|
+
// Initialize the Nokogumbo class and fetch constants we will use later
|
163
|
+
void Init_nokogumboc() {
|
164
|
+
rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
|
165
|
+
rb_require("nokogiri");
|
166
|
+
|
167
|
+
// class constants
|
168
|
+
VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
169
|
+
VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
|
170
|
+
Document = rb_const_get(HTML, rb_intern("Document"));
|
171
|
+
|
172
|
+
#ifndef NGLIB
|
173
|
+
// more class constants
|
174
|
+
VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
175
|
+
Element = rb_const_get(XML, rb_intern("Element"));
|
176
|
+
Text = rb_const_get(XML, rb_intern("Text"));
|
177
|
+
CDATA = rb_const_get(XML, rb_intern("CDATA"));
|
178
|
+
Comment = rb_const_get(XML, rb_intern("Comment"));
|
179
|
+
|
180
|
+
// interned symbols
|
181
|
+
new = rb_intern("new");
|
182
|
+
set_attribute = rb_intern("set_attribute");
|
183
|
+
add_child = rb_intern("add_child");
|
184
|
+
internal_subset = rb_intern("internal_subset");
|
185
|
+
remove_ = rb_intern("remove");
|
186
|
+
create_internal_subset = rb_intern("create_internal_subset");
|
187
|
+
#endif
|
188
|
+
|
189
|
+
// define Nokogumbo class with a singleton parse method
|
190
|
+
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
|
191
|
+
rb_define_singleton_method(Gumbo, "parse", parse, 1);
|
192
|
+
}
|
@@ -23,10 +23,10 @@
|
|
23
23
|
|
24
24
|
#include "util.h"
|
25
25
|
|
26
|
-
struct
|
26
|
+
struct GumboInternalParser;
|
27
27
|
|
28
28
|
GumboAttribute* gumbo_get_attribute(
|
29
|
-
const
|
29
|
+
const GumboVector* attributes, const char* name) {
|
30
30
|
for (int i = 0; i < attributes->length; ++i) {
|
31
31
|
GumboAttribute* attr = attributes->data[i];
|
32
32
|
if (!strcasecmp(attr->name, name)) {
|
@@ -37,7 +37,7 @@ GumboAttribute* gumbo_get_attribute(
|
|
37
37
|
}
|
38
38
|
|
39
39
|
void gumbo_destroy_attribute(
|
40
|
-
struct
|
40
|
+
struct GumboInternalParser* parser, GumboAttribute* attribute) {
|
41
41
|
gumbo_parser_deallocate(parser, (void*) attribute->name);
|
42
42
|
gumbo_parser_deallocate(parser, (void*) attribute->value);
|
43
43
|
gumbo_parser_deallocate(parser, (void*) attribute);
|
@@ -23,12 +23,12 @@
|
|
23
23
|
extern "C" {
|
24
24
|
#endif
|
25
25
|
|
26
|
-
struct
|
26
|
+
struct GumboInternalParser;
|
27
27
|
|
28
28
|
// Release the memory used for a GumboAttribute, including the attribute
|
29
29
|
// itself.
|
30
30
|
void gumbo_destroy_attribute(
|
31
|
-
struct
|
31
|
+
struct GumboInternalParser* parser, GumboAttribute* attribute);
|
32
32
|
|
33
33
|
#ifdef __cplusplus
|
34
34
|
}
|
data/gumbo-parser/src/char_ref.c
CHANGED
@@ -26,7 +26,7 @@
|
|
26
26
|
#include "utf8.h"
|
27
27
|
#include "util.h"
|
28
28
|
|
29
|
-
struct
|
29
|
+
struct GumboInternalParser;
|
30
30
|
|
31
31
|
const int kGumboNoChar = -1;
|
32
32
|
|
@@ -2351,7 +2351,7 @@ static int parse_digit(int c, bool allow_hex) {
|
|
2351
2351
|
}
|
2352
2352
|
|
2353
2353
|
static void add_no_digit_error(
|
2354
|
-
struct
|
2354
|
+
struct GumboInternalParser* parser, Utf8Iterator* input) {
|
2355
2355
|
GumboError* error = gumbo_add_error(parser);
|
2356
2356
|
if (!error) {
|
2357
2357
|
return;
|
@@ -2361,7 +2361,7 @@ static void add_no_digit_error(
|
|
2361
2361
|
}
|
2362
2362
|
|
2363
2363
|
static void add_codepoint_error(
|
2364
|
-
struct
|
2364
|
+
struct GumboInternalParser* parser, Utf8Iterator* input,
|
2365
2365
|
GumboErrorType type, int codepoint) {
|
2366
2366
|
GumboError* error = gumbo_add_error(parser);
|
2367
2367
|
if (!error) {
|
@@ -2373,7 +2373,7 @@ static void add_codepoint_error(
|
|
2373
2373
|
}
|
2374
2374
|
|
2375
2375
|
static void add_named_reference_error(
|
2376
|
-
struct
|
2376
|
+
struct GumboInternalParser* parser, Utf8Iterator* input,
|
2377
2377
|
GumboErrorType type, GumboStringPiece text) {
|
2378
2378
|
GumboError* error = gumbo_add_error(parser);
|
2379
2379
|
if (!error) {
|
@@ -2394,7 +2394,7 @@ static int maybe_replace_codepoint(int codepoint) {
|
|
2394
2394
|
}
|
2395
2395
|
|
2396
2396
|
static bool consume_numeric_ref(
|
2397
|
-
struct
|
2397
|
+
struct GumboInternalParser* parser, Utf8Iterator* input, int* output) {
|
2398
2398
|
utf8iterator_next(input);
|
2399
2399
|
bool is_hex = false;
|
2400
2400
|
int c = utf8iterator_current(input);
|
@@ -2475,7 +2475,7 @@ static bool is_legal_attribute_char_next(Utf8Iterator* input) {
|
|
2475
2475
|
}
|
2476
2476
|
|
2477
2477
|
static bool maybe_add_invalid_named_reference(
|
2478
|
-
struct
|
2478
|
+
struct GumboInternalParser* parser, Utf8Iterator* input) {
|
2479
2479
|
// The iterator will always be reset in this code path, so we don't need to
|
2480
2480
|
// worry about consuming characters.
|
2481
2481
|
const char* start = utf8iterator_get_char_pointer(input);
|
@@ -2498,7 +2498,7 @@ static bool maybe_add_invalid_named_reference(
|
|
2498
2498
|
}
|
2499
2499
|
|
2500
2500
|
static bool consume_named_ref(
|
2501
|
-
struct
|
2501
|
+
struct GumboInternalParser* parser, Utf8Iterator* input, bool is_in_attribute,
|
2502
2502
|
OneOrTwoCodepoints* output) {
|
2503
2503
|
assert(output->first == kGumboNoChar);
|
2504
2504
|
const NamedCharRef* char_ref = find_named_char_ref(input);
|
@@ -2530,7 +2530,7 @@ static bool consume_named_ref(
|
|
2530
2530
|
}
|
2531
2531
|
|
2532
2532
|
bool consume_char_ref(
|
2533
|
-
struct
|
2533
|
+
struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
|
2534
2534
|
int additional_allowed_char, bool is_in_attribute,
|
2535
2535
|
OneOrTwoCodepoints* output) {
|
2536
2536
|
utf8iterator_mark(input);
|
data/gumbo-parser/src/char_ref.h
CHANGED
@@ -27,8 +27,8 @@
|
|
27
27
|
extern "C" {
|
28
28
|
#endif
|
29
29
|
|
30
|
-
struct
|
31
|
-
struct
|
30
|
+
struct GumboInternalParser;
|
31
|
+
struct GumboInternalUtf8Iterator;
|
32
32
|
|
33
33
|
// Value that indicates no character was produced.
|
34
34
|
extern const int kGumboNoChar;
|
@@ -50,7 +50,7 @@ typedef struct {
|
|
50
50
|
// space for the "additional allowed char" when the spec says "with no
|
51
51
|
// additional allowed char". Returns false on parse error, true otherwise.
|
52
52
|
bool consume_char_ref(
|
53
|
-
struct
|
53
|
+
struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
|
54
54
|
int additional_allowed_char, bool is_in_attribute,
|
55
55
|
OneOrTwoCodepoints* output);
|
56
56
|
|
data/gumbo-parser/src/error.h
CHANGED
@@ -30,7 +30,7 @@
|
|
30
30
|
extern "C" {
|
31
31
|
#endif
|
32
32
|
|
33
|
-
struct
|
33
|
+
struct GumboInternalParser;
|
34
34
|
|
35
35
|
typedef enum {
|
36
36
|
GUMBO_ERR_UTF8_INVALID,
|
@@ -78,7 +78,7 @@ typedef enum {
|
|
78
78
|
} GumboErrorType;
|
79
79
|
|
80
80
|
// Additional data for duplicated attributes.
|
81
|
-
typedef struct
|
81
|
+
typedef struct GumboInternalDuplicateAttrError {
|
82
82
|
// The name of the attribute. Owned by this struct.
|
83
83
|
const char* name;
|
84
84
|
|
@@ -114,7 +114,7 @@ typedef enum {
|
|
114
114
|
// Additional data for tokenizer errors.
|
115
115
|
// This records the current state and codepoint encountered - this is usually
|
116
116
|
// enough to reconstruct what went wrong and provide a friendly error message.
|
117
|
-
typedef struct
|
117
|
+
typedef struct GumboInternalTokenizerError {
|
118
118
|
// The bad codepoint encountered.
|
119
119
|
int codepoint;
|
120
120
|
|
@@ -123,7 +123,7 @@ typedef struct _GumboTokenizerError {
|
|
123
123
|
} GumboTokenizerError;
|
124
124
|
|
125
125
|
// Additional data for parse errors.
|
126
|
-
typedef struct
|
126
|
+
typedef struct GumboInternalParserError {
|
127
127
|
// The type of input token that resulted in this error.
|
128
128
|
GumboTokenType input_type;
|
129
129
|
|
@@ -142,7 +142,7 @@ typedef struct _GumboParserError {
|
|
142
142
|
// The overall error struct representing an error in decoding/tokenizing/parsing
|
143
143
|
// the HTML. This contains an enumerated type flag, a source position, and then
|
144
144
|
// a union of fields containing data specific to the error.
|
145
|
-
typedef struct
|
145
|
+
typedef struct GumboInternalError {
|
146
146
|
// The type of error.
|
147
147
|
GumboErrorType type;
|
148
148
|
|
@@ -176,23 +176,23 @@ typedef struct _GumboError {
|
|
176
176
|
|
177
177
|
// Parser state, for GUMBO_ERR_PARSER and
|
178
178
|
// GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
|
179
|
-
struct
|
179
|
+
struct GumboInternalParserError parser;
|
180
180
|
} v;
|
181
181
|
} GumboError;
|
182
182
|
|
183
183
|
// Adds a new error to the parser's error list, and returns a pointer to it so
|
184
184
|
// that clients can fill out the rest of its fields. May return NULL if we're
|
185
185
|
// already over the max_errors field specified in GumboOptions.
|
186
|
-
GumboError* gumbo_add_error(struct
|
186
|
+
GumboError* gumbo_add_error(struct GumboInternalParser* parser);
|
187
187
|
|
188
188
|
// Initializes the errors vector in the parser.
|
189
|
-
void gumbo_init_errors(struct
|
189
|
+
void gumbo_init_errors(struct GumboInternalParser* errors);
|
190
190
|
|
191
191
|
// Frees all the errors in the 'errors_' field of the parser.
|
192
|
-
void gumbo_destroy_errors(struct
|
192
|
+
void gumbo_destroy_errors(struct GumboInternalParser* errors);
|
193
193
|
|
194
194
|
// Frees the memory used for a single GumboError.
|
195
|
-
void gumbo_error_destroy(struct
|
195
|
+
void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
|
196
196
|
|
197
197
|
// Prints an error to a string. This fills an empty GumboStringBuffer with a
|
198
198
|
// freshly-allocated buffer containing the error message text. The caller is
|
@@ -200,7 +200,7 @@ void gumbo_error_destroy(struct _GumboParser* parser, GumboError* error);
|
|
200
200
|
// the allocator specified in the GumboParser config and hence should be freed
|
201
201
|
// by gumbo_parser_deallocate().)
|
202
202
|
void gumbo_error_to_string(
|
203
|
-
struct
|
203
|
+
struct GumboInternalParser* parser, const GumboError* error,
|
204
204
|
GumboStringBuffer* output);
|
205
205
|
|
206
206
|
// Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
|
@@ -209,13 +209,13 @@ void gumbo_error_to_string(
|
|
209
209
|
// allocated with the allocator specified in the GumboParser config and hence
|
210
210
|
// should be freed by gumbo_parser_deallocate().)
|
211
211
|
void gumbo_caret_diagnostic_to_string(
|
212
|
-
struct
|
212
|
+
struct GumboInternalParser* parser, const GumboError* error,
|
213
213
|
const char* source_text, GumboStringBuffer* output);
|
214
214
|
|
215
215
|
// Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
|
216
216
|
// of writing to a string.
|
217
217
|
void gumbo_print_caret_diagnostic(
|
218
|
-
struct
|
218
|
+
struct GumboInternalParser* parser, const GumboError* error,
|
219
219
|
const char* source_text);
|
220
220
|
|
221
221
|
#ifdef __cplusplus
|
data/gumbo-parser/src/gumbo.h
CHANGED
@@ -59,7 +59,7 @@ extern "C" {
|
|
59
59
|
* buffer of bytes), while the column field is often used to reference a
|
60
60
|
* particular column on a printable display, which nowadays is usually UTF-8.
|
61
61
|
*/
|
62
|
-
typedef struct
|
62
|
+
typedef struct {
|
63
63
|
unsigned int line;
|
64
64
|
unsigned int column;
|
65
65
|
unsigned int offset;
|
@@ -81,7 +81,7 @@ extern const GumboSourcePosition kGumboEmptySourcePosition;
|
|
81
81
|
* Clients should assume that it is not NUL-terminated, and should always use
|
82
82
|
* explicit lengths when manipulating them.
|
83
83
|
*/
|
84
|
-
typedef struct
|
84
|
+
typedef struct {
|
85
85
|
/** A pointer to the beginning of the string. NULL iff length == 0. */
|
86
86
|
const char* data;
|
87
87
|
|
@@ -116,7 +116,7 @@ bool gumbo_string_equals_ignore_case(
|
|
116
116
|
* library. Iteration can be done through inspecting the structure directly in
|
117
117
|
* a for-loop.
|
118
118
|
*/
|
119
|
-
typedef struct
|
119
|
+
typedef struct {
|
120
120
|
/** Data elements. This points to a dynamically-allocated array of capacity
|
121
121
|
* elements, each a void* to the element itself.
|
122
122
|
*/
|
@@ -151,7 +151,7 @@ int gumbo_vector_index_of(GumboVector* vector, void* element);
|
|
151
151
|
* efficiency benefits, by letting the parser work with enums instead of
|
152
152
|
* strings.
|
153
153
|
*/
|
154
|
-
typedef enum
|
154
|
+
typedef enum {
|
155
155
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
|
156
156
|
GUMBO_TAG_HTML,
|
157
157
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
|
@@ -365,7 +365,7 @@ GumboTag gumbo_tag_enum(const char* tagname);
|
|
365
365
|
* HTML includes special handling for XLink, XML, and XMLNS namespaces on
|
366
366
|
* attributes. Everything else goes in the generic "NONE" namespace.
|
367
367
|
*/
|
368
|
-
typedef enum
|
368
|
+
typedef enum {
|
369
369
|
GUMBO_ATTR_NAMESPACE_NONE,
|
370
370
|
GUMBO_ATTR_NAMESPACE_XLINK,
|
371
371
|
GUMBO_ATTR_NAMESPACE_XML,
|
@@ -377,7 +377,7 @@ typedef enum _GumboAttributeNamespaceEnum {
|
|
377
377
|
* name-value pair, but also includes information about source locations and
|
378
378
|
* original source text.
|
379
379
|
*/
|
380
|
-
typedef struct
|
380
|
+
typedef struct {
|
381
381
|
/**
|
382
382
|
* The namespace for the attribute. This will usually be
|
383
383
|
* GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
|
@@ -438,14 +438,13 @@ typedef struct _GumboAttribute {
|
|
438
438
|
* and return it, or NULL if no such attribute exists. This uses a
|
439
439
|
* case-insensitive match, as HTML is case-insensitive.
|
440
440
|
*/
|
441
|
-
GumboAttribute* gumbo_get_attribute(
|
442
|
-
const struct _GumboVector* attrs, const char* name);
|
441
|
+
GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
|
443
442
|
|
444
443
|
/**
|
445
444
|
* Enum denoting the type of node. This determines the type of the node.v
|
446
445
|
* union.
|
447
446
|
*/
|
448
|
-
typedef enum
|
447
|
+
typedef enum {
|
449
448
|
/** Document node. v will be a GumboDocument. */
|
450
449
|
GUMBO_NODE_DOCUMENT,
|
451
450
|
/** Element node. v will be a GumboElement. */
|
@@ -464,10 +463,10 @@ typedef enum _GumboNodeType {
|
|
464
463
|
* Forward declaration of GumboNode so it can be used recursively in
|
465
464
|
* GumboNode.parent.
|
466
465
|
*/
|
467
|
-
typedef struct
|
466
|
+
typedef struct GumboInternalNode GumboNode;
|
468
467
|
|
469
468
|
/** http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode */
|
470
|
-
typedef enum
|
469
|
+
typedef enum {
|
471
470
|
GUMBO_DOCTYPE_NO_QUIRKS,
|
472
471
|
GUMBO_DOCTYPE_QUIRKS,
|
473
472
|
GUMBO_DOCTYPE_LIMITED_QUIRKS
|
@@ -480,7 +479,7 @@ typedef enum _GumboQuirksModeEnum {
|
|
480
479
|
* <math> tag is in the MathML namespace, and anything else is inside the HTML
|
481
480
|
* namespace. No other namespaces are supported, so this can be an enum only.
|
482
481
|
*/
|
483
|
-
typedef enum
|
482
|
+
typedef enum {
|
484
483
|
GUMBO_NAMESPACE_HTML,
|
485
484
|
GUMBO_NAMESPACE_SVG,
|
486
485
|
GUMBO_NAMESPACE_MATHML
|
@@ -494,7 +493,7 @@ typedef enum _GumboNamespaceEnum {
|
|
494
493
|
* may not be allowed by a style guide, or track the prevalence of incorrect or
|
495
494
|
* tricky HTML code.
|
496
495
|
*/
|
497
|
-
typedef enum
|
496
|
+
typedef enum {
|
498
497
|
/**
|
499
498
|
* A normal node - both start and end tags appear in the source, nothing has
|
500
499
|
* been reparented.
|
@@ -568,7 +567,7 @@ typedef enum _GumboParseFlags {
|
|
568
567
|
/**
|
569
568
|
* Information specific to document nodes.
|
570
569
|
*/
|
571
|
-
typedef struct
|
570
|
+
typedef struct {
|
572
571
|
/**
|
573
572
|
* An array of GumboNodes, containing the children of this element. This will
|
574
573
|
* normally consist of the <html> element and any comment nodes found.
|
@@ -595,7 +594,7 @@ typedef struct _GumboDocument {
|
|
595
594
|
* The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
|
596
595
|
* This contains just a block of text and its position.
|
597
596
|
*/
|
598
|
-
typedef struct
|
597
|
+
typedef struct {
|
599
598
|
/**
|
600
599
|
* The text of this node, after entities have been parsed and decoded. For
|
601
600
|
* comment/cdata nodes, this does not include the comment delimiters.
|
@@ -619,7 +618,7 @@ typedef struct _GumboText {
|
|
619
618
|
* The struct used to represent all HTML elements. This contains information
|
620
619
|
* about the tag, attributes, and child nodes.
|
621
620
|
*/
|
622
|
-
typedef struct
|
621
|
+
typedef struct {
|
623
622
|
/**
|
624
623
|
* An array of GumboNodes, containing the children of this element. Pointers
|
625
624
|
* are owned.
|
@@ -664,7 +663,7 @@ typedef struct _GumboElement {
|
|
664
663
|
* A supertype for GumboElement and GumboText, so that we can include one
|
665
664
|
* generic type in lists of children and cast as necessary to subtypes.
|
666
665
|
*/
|
667
|
-
struct
|
666
|
+
struct GumboInternalNode {
|
668
667
|
/** The type of node that this is. */
|
669
668
|
GumboNodeType type;
|
670
669
|
|
@@ -710,7 +709,7 @@ typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
|
|
710
709
|
* handling, etc.
|
711
710
|
* Use kGumboDefaultOptions for sensible defaults, and only set what you need.
|
712
711
|
*/
|
713
|
-
typedef struct
|
712
|
+
typedef struct GumboInternalOptions {
|
714
713
|
/** A memory allocator function. Default: malloc. */
|
715
714
|
GumboAllocatorFunction allocator;
|
716
715
|
|
@@ -749,7 +748,7 @@ typedef struct _GumboOptions {
|
|
749
748
|
extern const GumboOptions kGumboDefaultOptions;
|
750
749
|
|
751
750
|
/** The output struct containing the results of the parse. */
|
752
|
-
typedef struct
|
751
|
+
typedef struct GumboInternalOutput {
|
753
752
|
/**
|
754
753
|
* Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT
|
755
754
|
* that contains the entire document as its child.
|
@@ -779,18 +778,18 @@ typedef struct _GumboOutput {
|
|
779
778
|
*
|
780
779
|
* This doesn't support buffers longer than 4 gigabytes.
|
781
780
|
*/
|
782
|
-
|
781
|
+
GumboOutput* gumbo_parse(const char* buffer);
|
783
782
|
|
784
783
|
/**
|
785
784
|
* Extended version of gumbo_parse that takes an explicit options structure,
|
786
785
|
* buffer, and length.
|
787
786
|
*/
|
788
|
-
|
787
|
+
GumboOutput* gumbo_parse_with_options(
|
789
788
|
const GumboOptions* options, const char* buffer, size_t buffer_length);
|
790
789
|
|
791
790
|
/** Release the memory used for the parse tree & parse errors. */
|
792
791
|
void gumbo_destroy_output(
|
793
|
-
const
|
792
|
+
const GumboOptions* options, GumboOutput* output);
|
794
793
|
|
795
794
|
|
796
795
|
#ifdef __cplusplus
|