nokogumbo 0.9 → 0.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MDQzNzc1N2Q4MDg5Y2Q3OTgyMTg1MGExOGViNmIxOGIzODcyYzgwOQ==
5
+ data.tar.gz: !binary |-
6
+ YTYyNmMzZTEyNjFjODYyMjY3MTRmNmU1YWIwZjMyZTZiN2ZkMGU4Yg==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ NjMzYjg0YmU2NzZkZjUyN2YxZDZjOTM3ZWM5MDkxMjk0OWE5YjA3M2NjY2Y3
10
+ NDgwZTgxN2U0NWIyNzBkMmNlN2E4NjA1ODhmNGNiNTBhYzE3YWJkOTRiNjg3
11
+ YmMyZDVkYzIxZjljM2FjNDBhMmFiZTZmOWI1ODgzNmQwNzM4ZGY=
12
+ data.tar.gz: !binary |-
13
+ NjgzNTljOWNmMjY1MDBkZDc4ZmYzZGZlZDg2MDMxNzY0YTM2OWI3Y2Q1NDdl
14
+ M2FhNWE1M2U5YTQ0MjdlNTc0ZWM1OTg3NTkxMzdlNGU3MDdkNzBhZmNhZTUy
15
+ OTkzYWE0MjBiZTYyNmEyMGJiNDc2MTE4YWFiYWZkM2YwZDRlNTk=
data/README.md CHANGED
@@ -34,13 +34,16 @@ Notes
34
34
 
35
35
  * The `Nokogiri::HTML5.parse` function takes a string and passes it to the
36
36
  <code>gumbo_parse_with_options</code> method, using the default options.
37
- The resulting Gumbo parse tree is the walked, producing a
38
- [libxml2](http://xmlsoft.org/html/)
39
- [xmlDoc](http://xmlsoft.org/html/libxml-tree.html#xmlDoc).
40
- The original Gumbo parse tree is then destroyed, and single Nokogiri Ruby
41
- object is constructed to wrap the xmlDoc structure. Nokogiri only produces
42
- Ruby objects as necessary, so all searching is done using the underlying
43
- libxml2 libraries.
37
+ The resulting Gumbo parse tree is then walked.
38
+ * If the necessary Nokogiri and [libxml2](http://xmlsoft.org/html/) headers
39
+ can be found at installation time then an
40
+ [xmlDoc](http://xmlsoft.org/html/libxml-tree.html#xmlDoc) tree is produced
41
+ and a single Nokogiri Ruby object is constructed to wrap the xmlDoc
42
+ structure. Nokogiri only produces Ruby objects as necessary, so all
43
+ searching is done using the underlying libxml2 libraries.
44
+ * If the necessary headers are not present at installation time, then
45
+ Nokogiri Ruby objects are created for each Gumbo node. Other than
46
+ memory usage and CPU time, the results should be equivalent.
44
47
 
45
48
  * The `Nokogiri::HTML5.get` function takes care of following redirects,
46
49
  https, and determining the character encoding of the result, based on the
@@ -57,9 +60,11 @@ parser will be downloaded and compiled into the Gem itself.
57
60
  Installation
58
61
  ============
59
62
 
60
- * Execute `rake gem`
61
-
62
- * [sudo] gem install pkg/nokogumbo*.gem
63
+ git clone --recursive https://github.com/rubys/nokogumbo.git
64
+ cd nokogumbo
65
+ bundle install
66
+ rake gem
67
+ gem install pkg/nokogumbo*.gem
63
68
 
64
69
  Related efforts
65
70
  ============
@@ -0,0 +1,34 @@
1
+ require 'mkmf'
2
+ $CFLAGS += " -std=c99"
3
+
4
+ if have_library('xml2', 'xmlNewDoc')
5
+ # libxml2 libraries from http://www.xmlsoft.org/
6
+ pkg_config('libxml-2.0')
7
+
8
+ # nokogiri configuration from gem install
9
+ nokogiri_lib = Gem.find_files('nokogiri').
10
+ sort_by {|name| name[/nokogiri-([\d.]+)/,1].split('.').map(&:to_i)}.last
11
+ if nokogiri_lib
12
+ nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(.rb)?$), 'ext/nokogiri')
13
+
14
+ # if that doesn't work, try workarounds found in Nokogiri's extconf
15
+ unless find_header('nokogiri.h', nokogiri_ext)
16
+ require "#{nokogiri_ext}/extconf.rb"
17
+ end
18
+
19
+ # if found, enable direct calls to Nokogiri (and libxml2)
20
+ $CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext)
21
+ end
22
+ end
23
+
24
+ # add in gumbo-parser source from github if not already installed
25
+ unless have_library('gumbo', 'gumbo_parse')
26
+ rakehome = ENV['RAKEHOME'] || File.expand_path('../..')
27
+ unless File.exist? "#{rakehome}/ext/nokogumboc/gumbo.h"
28
+ require 'fileutils'
29
+ FileUtils.cp Dir["#{rakehome}/gumbo-parser/src/*"],
30
+ "#{rakehome}/ext/nokogumboc"
31
+ end
32
+ end
33
+
34
+ create_makefile('nokogumboc')
@@ -0,0 +1,192 @@
1
+ //
2
+ // nokogumbo.c defines the following:
3
+ //
4
+ // class Nokogumbo
5
+ // def self.parse(utf8_string) # returns Nokogiri::HTML::Document
6
+ // end
7
+ //
8
+ // Processing starts by calling gumbo_parse_with_options. The resulting
9
+ // document tree is then walked:
10
+ //
11
+ // * if Nokogiri and libxml2 headers are available at compile time,
12
+ // (ifdef NGLIB) then a parallel libxml2 tree is constructed, and the
13
+ // final document is then wrapped using Nokogiri_wrap_xml_document.
14
+ // This approach reduces memory and CPU requirements as Ruby objects
15
+ // are only built when necessary.
16
+ //
17
+ // * if the necessary headers are not available at compile time, Nokogiri
18
+ // methods are called instead, producing the equivalent functionality.
19
+ //
20
+
21
+ #include <ruby.h>
22
+ #include <gumbo.h>
23
+
24
+ // class constants
25
+ static VALUE Document;
26
+
27
+ #ifdef NGLIB
28
+ #include <nokogiri.h>
29
+ #include <libxml/tree.h>
30
+
31
+ #define NIL NULL
32
+ #define CONST_CAST (xmlChar const*)
33
+ #else
34
+ #define NIL 0
35
+ #define CONST_CAST
36
+
37
+ // more class constants
38
+ static VALUE Element;
39
+ static VALUE Text;
40
+ static VALUE CDATA;
41
+ static VALUE Comment;
42
+
43
+ // interned symbols
44
+ static VALUE new;
45
+ static VALUE set_attribute;
46
+ static VALUE add_child;
47
+ static VALUE internal_subset;
48
+ static VALUE remove_;
49
+ static VALUE create_internal_subset;
50
+
51
+ // map libxml2 types to Ruby VALUE
52
+ #define xmlNodePtr VALUE
53
+ #define xmlDocPtr VALUE
54
+
55
+ // redefine libxml2 API as Ruby function calls
56
+ #define xmlNewDocNode(doc, ns, name, content) \
57
+ rb_funcall(Element, new, 2, rb_str_new2(name), doc)
58
+ #define xmlNewProp(element, name, value) \
59
+ rb_funcall(element, set_attribute, 2, rb_str_new2(name), rb_str_new2(value))
60
+ #define xmlNewDocText(doc, text) \
61
+ rb_funcall(Text, new, 2, rb_str_new2(text), doc)
62
+ #define xmlNewCDataBlock(doc, content, length) \
63
+ rb_funcall(CDATA, new, 2, rb_str_new(content, length), doc)
64
+ #define xmlNewDocComment(doc, text) \
65
+ rb_funcall(Comment, new, 2, doc, rb_str_new2(text))
66
+ #define xmlAddChild(element, node) \
67
+ rb_funcall(element, add_child, 1, node)
68
+ #define xmlDocSetRootElement(doc, root) \
69
+ rb_funcall(doc, add_child, 1, root)
70
+ #define xmlCreateIntSubset(doc, name, external, system) \
71
+ rb_funcall(doc, create_internal_subset, 3, rb_str_new2(name), \
72
+ (external ? rb_str_new2(external) : Qnil), \
73
+ (system ? rb_str_new2(system) : Qnil));
74
+ #define Nokogiri_wrap_xml_document(klass, doc) \
75
+ doc
76
+
77
+ // remove internal subset from newly created documents
78
+ static VALUE xmlNewDoc(char* version) {
79
+ VALUE doc = rb_funcall(Document, new, 0);
80
+ rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0);
81
+ return doc;
82
+ }
83
+ #endif
84
+
85
+ // Build a Nokogiri Element for a given GumboElement (recursively)
86
+ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
87
+ // determine tag name for a given node
88
+ xmlNodePtr element;
89
+ if (node->tag != GUMBO_TAG_UNKNOWN) {
90
+ element = xmlNewDocNode(document, NIL,
91
+ CONST_CAST gumbo_normalized_tagname(node->tag), NIL);
92
+ } else {
93
+ GumboStringPiece tag = node->original_tag;
94
+ gumbo_tag_from_original_text(&tag);
95
+ char name[tag.length+1];
96
+ strncpy(name, tag.data, tag.length);
97
+ name[tag.length] = '\0';
98
+ element = xmlNewDocNode(document, NIL, CONST_CAST name, NIL);
99
+ }
100
+
101
+ // add in the attributes
102
+ GumboVector* attrs = &node->attributes;
103
+ for (int i=0; i < attrs->length; i++) {
104
+ GumboAttribute *attr = attrs->data[i];
105
+ xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
106
+ }
107
+
108
+ // add in the children
109
+ GumboVector* children = &node->children;
110
+ for (int i=0; i < children->length; i++) {
111
+ GumboNode* child = children->data[i];
112
+
113
+ xmlNodePtr node = NIL;
114
+
115
+ switch (child->type) {
116
+ case GUMBO_NODE_ELEMENT:
117
+ node = walk_tree(document, &child->v.element);
118
+ break;
119
+ case GUMBO_NODE_WHITESPACE:
120
+ case GUMBO_NODE_TEXT:
121
+ node = xmlNewDocText(document, CONST_CAST child->v.text.text);
122
+ break;
123
+ case GUMBO_NODE_CDATA:
124
+ node = xmlNewCDataBlock(document,
125
+ CONST_CAST child->v.text.original_text.data,
126
+ (int) child->v.text.original_text.length);
127
+ break;
128
+ case GUMBO_NODE_COMMENT:
129
+ node = xmlNewDocComment(document, CONST_CAST child->v.text.text);
130
+ break;
131
+ case GUMBO_NODE_DOCUMENT:
132
+ break; // should never happen -- ignore
133
+ }
134
+
135
+ if (node) xmlAddChild(element, node);
136
+ }
137
+
138
+ return element;
139
+ }
140
+
141
+ // Parse a string using gumbo_parse into a Nokogiri document
142
+ static VALUE parse(VALUE self, VALUE string) {
143
+ GumboOutput *output = gumbo_parse_with_options(
144
+ &kGumboDefaultOptions, RSTRING_PTR(string),
145
+ (size_t) RSTRING_LEN(string)
146
+ );
147
+ xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
148
+ xmlNodePtr root = walk_tree(doc, &output->root->v.element);
149
+ xmlDocSetRootElement(doc, root);
150
+ if (output->document->v.document.has_doctype) {
151
+ const char *public = output->document->v.document.public_identifier;
152
+ const char *system = output->document->v.document.system_identifier;
153
+ xmlCreateIntSubset(doc, CONST_CAST "html",
154
+ (strlen(public) ? CONST_CAST public : NIL),
155
+ (strlen(system) ? CONST_CAST system : NIL));
156
+ }
157
+ gumbo_destroy_output(&kGumboDefaultOptions, output);
158
+
159
+ return Nokogiri_wrap_xml_document(Document, doc);
160
+ }
161
+
162
+ // Initialize the Nokogumbo class and fetch constants we will use later
163
+ void Init_nokogumboc() {
164
+ rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
165
+ rb_require("nokogiri");
166
+
167
+ // class constants
168
+ VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
169
+ VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
170
+ Document = rb_const_get(HTML, rb_intern("Document"));
171
+
172
+ #ifndef NGLIB
173
+ // more class constants
174
+ VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
175
+ Element = rb_const_get(XML, rb_intern("Element"));
176
+ Text = rb_const_get(XML, rb_intern("Text"));
177
+ CDATA = rb_const_get(XML, rb_intern("CDATA"));
178
+ Comment = rb_const_get(XML, rb_intern("Comment"));
179
+
180
+ // interned symbols
181
+ new = rb_intern("new");
182
+ set_attribute = rb_intern("set_attribute");
183
+ add_child = rb_intern("add_child");
184
+ internal_subset = rb_intern("internal_subset");
185
+ remove_ = rb_intern("remove");
186
+ create_internal_subset = rb_intern("create_internal_subset");
187
+ #endif
188
+
189
+ // define Nokogumbo class with a singleton parse method
190
+ VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
191
+ rb_define_singleton_method(Gumbo, "parse", parse, 1);
192
+ }
@@ -23,10 +23,10 @@
23
23
 
24
24
  #include "util.h"
25
25
 
26
- struct _GumboParser;
26
+ struct GumboInternalParser;
27
27
 
28
28
  GumboAttribute* gumbo_get_attribute(
29
- const struct _GumboVector* attributes, const char* name) {
29
+ const GumboVector* attributes, const char* name) {
30
30
  for (int i = 0; i < attributes->length; ++i) {
31
31
  GumboAttribute* attr = attributes->data[i];
32
32
  if (!strcasecmp(attr->name, name)) {
@@ -37,7 +37,7 @@ GumboAttribute* gumbo_get_attribute(
37
37
  }
38
38
 
39
39
  void gumbo_destroy_attribute(
40
- struct _GumboParser* parser, GumboAttribute* attribute) {
40
+ struct GumboInternalParser* parser, GumboAttribute* attribute) {
41
41
  gumbo_parser_deallocate(parser, (void*) attribute->name);
42
42
  gumbo_parser_deallocate(parser, (void*) attribute->value);
43
43
  gumbo_parser_deallocate(parser, (void*) attribute);
@@ -23,12 +23,12 @@
23
23
  extern "C" {
24
24
  #endif
25
25
 
26
- struct _GumboParser;
26
+ struct GumboInternalParser;
27
27
 
28
28
  // Release the memory used for a GumboAttribute, including the attribute
29
29
  // itself.
30
30
  void gumbo_destroy_attribute(
31
- struct _GumboParser* parser, GumboAttribute* attribute);
31
+ struct GumboInternalParser* parser, GumboAttribute* attribute);
32
32
 
33
33
  #ifdef __cplusplus
34
34
  }
@@ -26,7 +26,7 @@
26
26
  #include "utf8.h"
27
27
  #include "util.h"
28
28
 
29
- struct _GumboParser;
29
+ struct GumboInternalParser;
30
30
 
31
31
  const int kGumboNoChar = -1;
32
32
 
@@ -2351,7 +2351,7 @@ static int parse_digit(int c, bool allow_hex) {
2351
2351
  }
2352
2352
 
2353
2353
  static void add_no_digit_error(
2354
- struct _GumboParser* parser, Utf8Iterator* input) {
2354
+ struct GumboInternalParser* parser, Utf8Iterator* input) {
2355
2355
  GumboError* error = gumbo_add_error(parser);
2356
2356
  if (!error) {
2357
2357
  return;
@@ -2361,7 +2361,7 @@ static void add_no_digit_error(
2361
2361
  }
2362
2362
 
2363
2363
  static void add_codepoint_error(
2364
- struct _GumboParser* parser, Utf8Iterator* input,
2364
+ struct GumboInternalParser* parser, Utf8Iterator* input,
2365
2365
  GumboErrorType type, int codepoint) {
2366
2366
  GumboError* error = gumbo_add_error(parser);
2367
2367
  if (!error) {
@@ -2373,7 +2373,7 @@ static void add_codepoint_error(
2373
2373
  }
2374
2374
 
2375
2375
  static void add_named_reference_error(
2376
- struct _GumboParser* parser, Utf8Iterator* input,
2376
+ struct GumboInternalParser* parser, Utf8Iterator* input,
2377
2377
  GumboErrorType type, GumboStringPiece text) {
2378
2378
  GumboError* error = gumbo_add_error(parser);
2379
2379
  if (!error) {
@@ -2394,7 +2394,7 @@ static int maybe_replace_codepoint(int codepoint) {
2394
2394
  }
2395
2395
 
2396
2396
  static bool consume_numeric_ref(
2397
- struct _GumboParser* parser, Utf8Iterator* input, int* output) {
2397
+ struct GumboInternalParser* parser, Utf8Iterator* input, int* output) {
2398
2398
  utf8iterator_next(input);
2399
2399
  bool is_hex = false;
2400
2400
  int c = utf8iterator_current(input);
@@ -2475,7 +2475,7 @@ static bool is_legal_attribute_char_next(Utf8Iterator* input) {
2475
2475
  }
2476
2476
 
2477
2477
  static bool maybe_add_invalid_named_reference(
2478
- struct _GumboParser* parser, Utf8Iterator* input) {
2478
+ struct GumboInternalParser* parser, Utf8Iterator* input) {
2479
2479
  // The iterator will always be reset in this code path, so we don't need to
2480
2480
  // worry about consuming characters.
2481
2481
  const char* start = utf8iterator_get_char_pointer(input);
@@ -2498,7 +2498,7 @@ static bool maybe_add_invalid_named_reference(
2498
2498
  }
2499
2499
 
2500
2500
  static bool consume_named_ref(
2501
- struct _GumboParser* parser, Utf8Iterator* input, bool is_in_attribute,
2501
+ struct GumboInternalParser* parser, Utf8Iterator* input, bool is_in_attribute,
2502
2502
  OneOrTwoCodepoints* output) {
2503
2503
  assert(output->first == kGumboNoChar);
2504
2504
  const NamedCharRef* char_ref = find_named_char_ref(input);
@@ -2530,7 +2530,7 @@ static bool consume_named_ref(
2530
2530
  }
2531
2531
 
2532
2532
  bool consume_char_ref(
2533
- struct _GumboParser* parser, struct _Utf8Iterator* input,
2533
+ struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
2534
2534
  int additional_allowed_char, bool is_in_attribute,
2535
2535
  OneOrTwoCodepoints* output) {
2536
2536
  utf8iterator_mark(input);
@@ -27,8 +27,8 @@
27
27
  extern "C" {
28
28
  #endif
29
29
 
30
- struct _GumboParser;
31
- struct _Utf8Iterator;
30
+ struct GumboInternalParser;
31
+ struct GumboInternalUtf8Iterator;
32
32
 
33
33
  // Value that indicates no character was produced.
34
34
  extern const int kGumboNoChar;
@@ -50,7 +50,7 @@ typedef struct {
50
50
  // space for the "additional allowed char" when the spec says "with no
51
51
  // additional allowed char". Returns false on parse error, true otherwise.
52
52
  bool consume_char_ref(
53
- struct _GumboParser* parser, struct _Utf8Iterator* input,
53
+ struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
54
54
  int additional_allowed_char, bool is_in_attribute,
55
55
  OneOrTwoCodepoints* output);
56
56
 
@@ -30,7 +30,7 @@
30
30
  extern "C" {
31
31
  #endif
32
32
 
33
- struct _GumboParser;
33
+ struct GumboInternalParser;
34
34
 
35
35
  typedef enum {
36
36
  GUMBO_ERR_UTF8_INVALID,
@@ -78,7 +78,7 @@ typedef enum {
78
78
  } GumboErrorType;
79
79
 
80
80
  // Additional data for duplicated attributes.
81
- typedef struct _GumboDuplicateAttrError {
81
+ typedef struct GumboInternalDuplicateAttrError {
82
82
  // The name of the attribute. Owned by this struct.
83
83
  const char* name;
84
84
 
@@ -114,7 +114,7 @@ typedef enum {
114
114
  // Additional data for tokenizer errors.
115
115
  // This records the current state and codepoint encountered - this is usually
116
116
  // enough to reconstruct what went wrong and provide a friendly error message.
117
- typedef struct _GumboTokenizerError {
117
+ typedef struct GumboInternalTokenizerError {
118
118
  // The bad codepoint encountered.
119
119
  int codepoint;
120
120
 
@@ -123,7 +123,7 @@ typedef struct _GumboTokenizerError {
123
123
  } GumboTokenizerError;
124
124
 
125
125
  // Additional data for parse errors.
126
- typedef struct _GumboParserError {
126
+ typedef struct GumboInternalParserError {
127
127
  // The type of input token that resulted in this error.
128
128
  GumboTokenType input_type;
129
129
 
@@ -142,7 +142,7 @@ typedef struct _GumboParserError {
142
142
  // The overall error struct representing an error in decoding/tokenizing/parsing
143
143
  // the HTML. This contains an enumerated type flag, a source position, and then
144
144
  // a union of fields containing data specific to the error.
145
- typedef struct _GumboError {
145
+ typedef struct GumboInternalError {
146
146
  // The type of error.
147
147
  GumboErrorType type;
148
148
 
@@ -176,23 +176,23 @@ typedef struct _GumboError {
176
176
 
177
177
  // Parser state, for GUMBO_ERR_PARSER and
178
178
  // GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
179
- struct _GumboParserError parser;
179
+ struct GumboInternalParserError parser;
180
180
  } v;
181
181
  } GumboError;
182
182
 
183
183
  // Adds a new error to the parser's error list, and returns a pointer to it so
184
184
  // that clients can fill out the rest of its fields. May return NULL if we're
185
185
  // already over the max_errors field specified in GumboOptions.
186
- GumboError* gumbo_add_error(struct _GumboParser* parser);
186
+ GumboError* gumbo_add_error(struct GumboInternalParser* parser);
187
187
 
188
188
  // Initializes the errors vector in the parser.
189
- void gumbo_init_errors(struct _GumboParser* errors);
189
+ void gumbo_init_errors(struct GumboInternalParser* errors);
190
190
 
191
191
  // Frees all the errors in the 'errors_' field of the parser.
192
- void gumbo_destroy_errors(struct _GumboParser* errors);
192
+ void gumbo_destroy_errors(struct GumboInternalParser* errors);
193
193
 
194
194
  // Frees the memory used for a single GumboError.
195
- void gumbo_error_destroy(struct _GumboParser* parser, GumboError* error);
195
+ void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
196
196
 
197
197
  // Prints an error to a string. This fills an empty GumboStringBuffer with a
198
198
  // freshly-allocated buffer containing the error message text. The caller is
@@ -200,7 +200,7 @@ void gumbo_error_destroy(struct _GumboParser* parser, GumboError* error);
200
200
  // the allocator specified in the GumboParser config and hence should be freed
201
201
  // by gumbo_parser_deallocate().)
202
202
  void gumbo_error_to_string(
203
- struct _GumboParser* parser, const GumboError* error,
203
+ struct GumboInternalParser* parser, const GumboError* error,
204
204
  GumboStringBuffer* output);
205
205
 
206
206
  // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
@@ -209,13 +209,13 @@ void gumbo_error_to_string(
209
209
  // allocated with the allocator specified in the GumboParser config and hence
210
210
  // should be freed by gumbo_parser_deallocate().)
211
211
  void gumbo_caret_diagnostic_to_string(
212
- struct _GumboParser* parser, const GumboError* error,
212
+ struct GumboInternalParser* parser, const GumboError* error,
213
213
  const char* source_text, GumboStringBuffer* output);
214
214
 
215
215
  // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
216
216
  // of writing to a string.
217
217
  void gumbo_print_caret_diagnostic(
218
- struct _GumboParser* parser, const GumboError* error,
218
+ struct GumboInternalParser* parser, const GumboError* error,
219
219
  const char* source_text);
220
220
 
221
221
  #ifdef __cplusplus
@@ -59,7 +59,7 @@ extern "C" {
59
59
  * buffer of bytes), while the column field is often used to reference a
60
60
  * particular column on a printable display, which nowadays is usually UTF-8.
61
61
  */
62
- typedef struct _GumboSourcePosition {
62
+ typedef struct {
63
63
  unsigned int line;
64
64
  unsigned int column;
65
65
  unsigned int offset;
@@ -81,7 +81,7 @@ extern const GumboSourcePosition kGumboEmptySourcePosition;
81
81
  * Clients should assume that it is not NUL-terminated, and should always use
82
82
  * explicit lengths when manipulating them.
83
83
  */
84
- typedef struct _GumboStringPiece {
84
+ typedef struct {
85
85
  /** A pointer to the beginning of the string. NULL iff length == 0. */
86
86
  const char* data;
87
87
 
@@ -116,7 +116,7 @@ bool gumbo_string_equals_ignore_case(
116
116
  * library. Iteration can be done through inspecting the structure directly in
117
117
  * a for-loop.
118
118
  */
119
- typedef struct _GumboVector {
119
+ typedef struct {
120
120
  /** Data elements. This points to a dynamically-allocated array of capacity
121
121
  * elements, each a void* to the element itself.
122
122
  */
@@ -151,7 +151,7 @@ int gumbo_vector_index_of(GumboVector* vector, void* element);
151
151
  * efficiency benefits, by letting the parser work with enums instead of
152
152
  * strings.
153
153
  */
154
- typedef enum _GumboTag {
154
+ typedef enum {
155
155
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
156
156
  GUMBO_TAG_HTML,
157
157
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
@@ -365,7 +365,7 @@ GumboTag gumbo_tag_enum(const char* tagname);
365
365
  * HTML includes special handling for XLink, XML, and XMLNS namespaces on
366
366
  * attributes. Everything else goes in the generic "NONE" namespace.
367
367
  */
368
- typedef enum _GumboAttributeNamespaceEnum {
368
+ typedef enum {
369
369
  GUMBO_ATTR_NAMESPACE_NONE,
370
370
  GUMBO_ATTR_NAMESPACE_XLINK,
371
371
  GUMBO_ATTR_NAMESPACE_XML,
@@ -377,7 +377,7 @@ typedef enum _GumboAttributeNamespaceEnum {
377
377
  * name-value pair, but also includes information about source locations and
378
378
  * original source text.
379
379
  */
380
- typedef struct _GumboAttribute {
380
+ typedef struct {
381
381
  /**
382
382
  * The namespace for the attribute. This will usually be
383
383
  * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
@@ -438,14 +438,13 @@ typedef struct _GumboAttribute {
438
438
  * and return it, or NULL if no such attribute exists. This uses a
439
439
  * case-insensitive match, as HTML is case-insensitive.
440
440
  */
441
- GumboAttribute* gumbo_get_attribute(
442
- const struct _GumboVector* attrs, const char* name);
441
+ GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
443
442
 
444
443
  /**
445
444
  * Enum denoting the type of node. This determines the type of the node.v
446
445
  * union.
447
446
  */
448
- typedef enum _GumboNodeType {
447
+ typedef enum {
449
448
  /** Document node. v will be a GumboDocument. */
450
449
  GUMBO_NODE_DOCUMENT,
451
450
  /** Element node. v will be a GumboElement. */
@@ -464,10 +463,10 @@ typedef enum _GumboNodeType {
464
463
  * Forward declaration of GumboNode so it can be used recursively in
465
464
  * GumboNode.parent.
466
465
  */
467
- typedef struct _GumboNode GumboNode;
466
+ typedef struct GumboInternalNode GumboNode;
468
467
 
469
468
  /** http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode */
470
- typedef enum _GumboQuirksModeEnum {
469
+ typedef enum {
471
470
  GUMBO_DOCTYPE_NO_QUIRKS,
472
471
  GUMBO_DOCTYPE_QUIRKS,
473
472
  GUMBO_DOCTYPE_LIMITED_QUIRKS
@@ -480,7 +479,7 @@ typedef enum _GumboQuirksModeEnum {
480
479
  * <math> tag is in the MathML namespace, and anything else is inside the HTML
481
480
  * namespace. No other namespaces are supported, so this can be an enum only.
482
481
  */
483
- typedef enum _GumboNamespaceEnum {
482
+ typedef enum {
484
483
  GUMBO_NAMESPACE_HTML,
485
484
  GUMBO_NAMESPACE_SVG,
486
485
  GUMBO_NAMESPACE_MATHML
@@ -494,7 +493,7 @@ typedef enum _GumboNamespaceEnum {
494
493
  * may not be allowed by a style guide, or track the prevalence of incorrect or
495
494
  * tricky HTML code.
496
495
  */
497
- typedef enum _GumboParseFlags {
496
+ typedef enum {
498
497
  /**
499
498
  * A normal node - both start and end tags appear in the source, nothing has
500
499
  * been reparented.
@@ -568,7 +567,7 @@ typedef enum _GumboParseFlags {
568
567
  /**
569
568
  * Information specific to document nodes.
570
569
  */
571
- typedef struct _GumboDocument {
570
+ typedef struct {
572
571
  /**
573
572
  * An array of GumboNodes, containing the children of this element. This will
574
573
  * normally consist of the <html> element and any comment nodes found.
@@ -595,7 +594,7 @@ typedef struct _GumboDocument {
595
594
  * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
596
595
  * This contains just a block of text and its position.
597
596
  */
598
- typedef struct _GumboText {
597
+ typedef struct {
599
598
  /**
600
599
  * The text of this node, after entities have been parsed and decoded. For
601
600
  * comment/cdata nodes, this does not include the comment delimiters.
@@ -619,7 +618,7 @@ typedef struct _GumboText {
619
618
  * The struct used to represent all HTML elements. This contains information
620
619
  * about the tag, attributes, and child nodes.
621
620
  */
622
- typedef struct _GumboElement {
621
+ typedef struct {
623
622
  /**
624
623
  * An array of GumboNodes, containing the children of this element. Pointers
625
624
  * are owned.
@@ -664,7 +663,7 @@ typedef struct _GumboElement {
664
663
  * A supertype for GumboElement and GumboText, so that we can include one
665
664
  * generic type in lists of children and cast as necessary to subtypes.
666
665
  */
667
- struct _GumboNode {
666
+ struct GumboInternalNode {
668
667
  /** The type of node that this is. */
669
668
  GumboNodeType type;
670
669
 
@@ -710,7 +709,7 @@ typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
710
709
  * handling, etc.
711
710
  * Use kGumboDefaultOptions for sensible defaults, and only set what you need.
712
711
  */
713
- typedef struct _GumboOptions {
712
+ typedef struct GumboInternalOptions {
714
713
  /** A memory allocator function. Default: malloc. */
715
714
  GumboAllocatorFunction allocator;
716
715
 
@@ -749,7 +748,7 @@ typedef struct _GumboOptions {
749
748
  extern const GumboOptions kGumboDefaultOptions;
750
749
 
751
750
  /** The output struct containing the results of the parse. */
752
- typedef struct _GumboOutput {
751
+ typedef struct GumboInternalOutput {
753
752
  /**
754
753
  * Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT
755
754
  * that contains the entire document as its child.
@@ -779,18 +778,18 @@ typedef struct _GumboOutput {
779
778
  *
780
779
  * This doesn't support buffers longer than 4 gigabytes.
781
780
  */
782
- struct _GumboOutput* gumbo_parse(const char* buffer);
781
+ GumboOutput* gumbo_parse(const char* buffer);
783
782
 
784
783
  /**
785
784
  * Extended version of gumbo_parse that takes an explicit options structure,
786
785
  * buffer, and length.
787
786
  */
788
- struct _GumboOutput* gumbo_parse_with_options(
787
+ GumboOutput* gumbo_parse_with_options(
789
788
  const GumboOptions* options, const char* buffer, size_t buffer_length);
790
789
 
791
790
  /** Release the memory used for the parse tree & parse errors. */
792
791
  void gumbo_destroy_output(
793
- const struct _GumboOptions* options, GumboOutput* output);
792
+ const GumboOptions* options, GumboOutput* output);
794
793
 
795
794
 
796
795
  #ifdef __cplusplus