nokogumbo 0.9 → 0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MDQzNzc1N2Q4MDg5Y2Q3OTgyMTg1MGExOGViNmIxOGIzODcyYzgwOQ==
5
+ data.tar.gz: !binary |-
6
+ YTYyNmMzZTEyNjFjODYyMjY3MTRmNmU1YWIwZjMyZTZiN2ZkMGU4Yg==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ NjMzYjg0YmU2NzZkZjUyN2YxZDZjOTM3ZWM5MDkxMjk0OWE5YjA3M2NjY2Y3
10
+ NDgwZTgxN2U0NWIyNzBkMmNlN2E4NjA1ODhmNGNiNTBhYzE3YWJkOTRiNjg3
11
+ YmMyZDVkYzIxZjljM2FjNDBhMmFiZTZmOWI1ODgzNmQwNzM4ZGY=
12
+ data.tar.gz: !binary |-
13
+ NjgzNTljOWNmMjY1MDBkZDc4ZmYzZGZlZDg2MDMxNzY0YTM2OWI3Y2Q1NDdl
14
+ M2FhNWE1M2U5YTQ0MjdlNTc0ZWM1OTg3NTkxMzdlNGU3MDdkNzBhZmNhZTUy
15
+ OTkzYWE0MjBiZTYyNmEyMGJiNDc2MTE4YWFiYWZkM2YwZDRlNTk=
data/README.md CHANGED
@@ -34,13 +34,16 @@ Notes
34
34
 
35
35
  * The `Nokogiri::HTML5.parse` function takes a string and passes it to the
36
36
  <code>gumbo_parse_with_options</code> method, using the default options.
37
- The resulting Gumbo parse tree is the walked, producing a
38
- [libxml2](http://xmlsoft.org/html/)
39
- [xmlDoc](http://xmlsoft.org/html/libxml-tree.html#xmlDoc).
40
- The original Gumbo parse tree is then destroyed, and single Nokogiri Ruby
41
- object is constructed to wrap the xmlDoc structure. Nokogiri only produces
42
- Ruby objects as necessary, so all searching is done using the underlying
43
- libxml2 libraries.
37
+ The resulting Gumbo parse tree is then walked.
38
+ * If the necessary Nokogiri and [libxml2](http://xmlsoft.org/html/) headers
39
+ can be found at installation time then an
40
+ [xmlDoc](http://xmlsoft.org/html/libxml-tree.html#xmlDoc) tree is produced
41
+ and a single Nokogiri Ruby object is constructed to wrap the xmlDoc
42
+ structure. Nokogiri only produces Ruby objects as necessary, so all
43
+ searching is done using the underlying libxml2 libraries.
44
+ * If the necessary headers are not present at installation time, then
45
+ Nokogiri Ruby objects are created for each Gumbo node. Other than
46
+ memory usage and CPU time, the results should be equivalent.
44
47
 
45
48
  * The `Nokogiri::HTML5.get` function takes care of following redirects,
46
49
  https, and determining the character encoding of the result, based on the
@@ -57,9 +60,11 @@ parser will be downloaded and compiled into the Gem itself.
57
60
  Installation
58
61
  ============
59
62
 
60
- * Execute `rake gem`
61
-
62
- * [sudo] gem install pkg/nokogumbo*.gem
63
+ git clone --recursive https://github.com/rubys/nokogumbo.git
64
+ cd nokogumbo
65
+ bundle install
66
+ rake gem
67
+ gem install pkg/nokogumbo*.gem
63
68
 
64
69
  Related efforts
65
70
  ============
@@ -0,0 +1,34 @@
1
# Build configuration for the nokogumboc extension.
#
# * When libxml2 and Nokogiri's nokogiri.h can be located, -DNGLIB is added
#   so the C code calls Nokogiri/libxml2 directly.
# * When no gumbo library is installed, the gumbo-parser sources are copied
#   next to the extension so they get compiled into it.
require 'mkmf'
$CFLAGS += " -std=c99"

if have_library('xml2', 'xmlNewDoc')
  # libxml2 libraries from http://www.xmlsoft.org/
  pkg_config('libxml-2.0')

  # nokogiri configuration from gem install: pick the highest versioned copy.
  # A path without a "nokogiri-x.y.z" component yields an empty version
  # (sorting lowest) instead of raising NoMethodError on nil.
  nokogiri_lib = Gem.find_files('nokogiri').
    sort_by {|name| name[/nokogiri-([\d.]+)/, 1].to_s.split('.').map(&:to_i)}.
    last

  if nokogiri_lib
    nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(\.rb)?$), 'ext/nokogiri')

    # if that doesn't work, try workarounds found in Nokogiri's extconf
    unless find_header('nokogiri.h', nokogiri_ext)
      require "#{nokogiri_ext}/extconf.rb"
    end

    # if found, enable direct calls to Nokogiri (and libxml2)
    $CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext)
  end
end

# add in gumbo-parser source from github if not already installed
unless have_library('gumbo', 'gumbo_parse')
  rakehome = ENV['RAKEHOME'] || File.expand_path('../..')
  unless File.exist? "#{rakehome}/ext/nokogumboc/gumbo.h"
    require 'fileutils'
    FileUtils.cp Dir["#{rakehome}/gumbo-parser/src/*"],
      "#{rakehome}/ext/nokogumboc"
  end
end

create_makefile('nokogumboc')
@@ -0,0 +1,192 @@
1
+ //
2
+ // nokogumbo.c defines the following:
3
+ //
4
+ // class Nokogumbo
5
+ // def parse(utf8_string) # returns Nokogiri::HTML::Document
6
+ // end
7
+ //
8
+ // Processing starts by calling gumbo_parse_with_options. The resulting
9
+ // document tree is then walked:
10
+ //
11
+ // * if Nokogiri and libxml2 headers are available at compile time,
12
+ // (ifdef NGLIB) then a parallel libxml2 tree is constructed, and the
13
+ // final document is then wrapped using Nokogiri_wrap_xml_document.
14
+ // This approach reduces memory and CPU requirements as Ruby objects
15
+ // are only built when necessary.
16
+ //
17
+ // * if the necessary headers are not available at compile time, Nokogiri
18
+ // methods are called instead, producing the equivalent functionality.
19
+ //
20
+
21
+ #include <ruby.h>
22
+ #include <gumbo.h>
23
+
24
+ // class constants
25
+ static VALUE Document;
26
+
27
+ #ifdef NGLIB
28
+ #include <nokogiri.h>
29
+ #include <libxml/tree.h>
30
+
31
+ #define NIL NULL
32
+ #define CONST_CAST (xmlChar const*)
33
+ #else
34
+ #define NIL 0
35
+ #define CONST_CAST
36
+
37
+ // more class constants
38
+ static VALUE Element;
39
+ static VALUE Text;
40
+ static VALUE CDATA;
41
+ static VALUE Comment;
42
+
43
+ // interned symbols
44
+ static VALUE new;
45
+ static VALUE set_attribute;
46
+ static VALUE add_child;
47
+ static VALUE internal_subset;
48
+ static VALUE remove_;
49
+ static VALUE create_internal_subset;
50
+
51
+ // map libxml2 types to Ruby VALUE
52
+ #define xmlNodePtr VALUE
53
+ #define xmlDocPtr VALUE
54
+
55
+ // redefine libxml2 API as Ruby function calls
56
+ #define xmlNewDocNode(doc, ns, name, content) \
57
+ rb_funcall(Element, new, 2, rb_str_new2(name), doc)
58
+ #define xmlNewProp(element, name, value) \
59
+ rb_funcall(element, set_attribute, 2, rb_str_new2(name), rb_str_new2(value))
60
+ #define xmlNewDocText(doc, text) \
61
+ rb_funcall(Text, new, 2, rb_str_new2(text), doc)
62
+ #define xmlNewCDataBlock(doc, content, length) \
63
+ rb_funcall(CDATA, new, 2, rb_str_new(content, length), doc)
64
+ #define xmlNewDocComment(doc, text) \
65
+ rb_funcall(Comment, new, 2, doc, rb_str_new2(text))
66
+ #define xmlAddChild(element, node) \
67
+ rb_funcall(element, add_child, 1, node)
68
+ #define xmlDocSetRootElement(doc, root) \
69
+ rb_funcall(doc, add_child, 1, root)
70
+ #define xmlCreateIntSubset(doc, name, external, system) \
71
+ rb_funcall(doc, create_internal_subset, 3, rb_str_new2(name), \
72
+ (external ? rb_str_new2(external) : Qnil), \
73
+ (system ? rb_str_new2(system) : Qnil));
74
+ #define Nokogiri_wrap_xml_document(klass, doc) \
75
+ doc
76
+
77
+ // remove internal subset from newly created documents
78
+ static VALUE xmlNewDoc(char* version) {
79
+ VALUE doc = rb_funcall(Document, new, 0);
80
+ rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0);
81
+ return doc;
82
+ }
83
+ #endif
84
+
85
+ // Build a Nokogiri Element for a given GumboElement (recursively)
86
+ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
87
+ // determine tag name for a given node
88
+ xmlNodePtr element;
89
+ if (node->tag != GUMBO_TAG_UNKNOWN) {
90
+ element = xmlNewDocNode(document, NIL,
91
+ CONST_CAST gumbo_normalized_tagname(node->tag), NIL);
92
+ } else {
93
+ GumboStringPiece tag = node->original_tag;
94
+ gumbo_tag_from_original_text(&tag);
95
+ char name[tag.length+1];
96
+ strncpy(name, tag.data, tag.length);
97
+ name[tag.length] = '\0';
98
+ element = xmlNewDocNode(document, NIL, CONST_CAST name, NIL);
99
+ }
100
+
101
+ // add in the attributes
102
+ GumboVector* attrs = &node->attributes;
103
+ for (int i=0; i < attrs->length; i++) {
104
+ GumboAttribute *attr = attrs->data[i];
105
+ xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
106
+ }
107
+
108
+ // add in the children
109
+ GumboVector* children = &node->children;
110
+ for (int i=0; i < children->length; i++) {
111
+ GumboNode* child = children->data[i];
112
+
113
+ xmlNodePtr node = NIL;
114
+
115
+ switch (child->type) {
116
+ case GUMBO_NODE_ELEMENT:
117
+ node = walk_tree(document, &child->v.element);
118
+ break;
119
+ case GUMBO_NODE_WHITESPACE:
120
+ case GUMBO_NODE_TEXT:
121
+ node = xmlNewDocText(document, CONST_CAST child->v.text.text);
122
+ break;
123
+ case GUMBO_NODE_CDATA:
124
+ node = xmlNewCDataBlock(document,
125
+ CONST_CAST child->v.text.original_text.data,
126
+ (int) child->v.text.original_text.length);
127
+ break;
128
+ case GUMBO_NODE_COMMENT:
129
+ node = xmlNewDocComment(document, CONST_CAST child->v.text.text);
130
+ break;
131
+ case GUMBO_NODE_DOCUMENT:
132
+ break; // should never happen -- ignore
133
+ }
134
+
135
+ if (node) xmlAddChild(element, node);
136
+ }
137
+
138
+ return element;
139
+ }
140
+
141
+ // Parse a string using gumbo_parse into a Nokogiri document
142
+ static VALUE parse(VALUE self, VALUE string) {
143
+ GumboOutput *output = gumbo_parse_with_options(
144
+ &kGumboDefaultOptions, RSTRING_PTR(string),
145
+ (size_t) RSTRING_LEN(string)
146
+ );
147
+ xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
148
+ xmlNodePtr root = walk_tree(doc, &output->root->v.element);
149
+ xmlDocSetRootElement(doc, root);
150
+ if (output->document->v.document.has_doctype) {
151
+ const char *public = output->document->v.document.public_identifier;
152
+ const char *system = output->document->v.document.system_identifier;
153
+ xmlCreateIntSubset(doc, CONST_CAST "html",
154
+ (strlen(public) ? CONST_CAST public : NIL),
155
+ (strlen(system) ? CONST_CAST system : NIL));
156
+ }
157
+ gumbo_destroy_output(&kGumboDefaultOptions, output);
158
+
159
+ return Nokogiri_wrap_xml_document(Document, doc);
160
+ }
161
+
162
+ // Initialize the Nokogumbo class and fetch constants we will use later
163
+ void Init_nokogumboc() {
164
+ rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
165
+ rb_require("nokogiri");
166
+
167
+ // class constants
168
+ VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
169
+ VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
170
+ Document = rb_const_get(HTML, rb_intern("Document"));
171
+
172
+ #ifndef NGLIB
173
+ // more class constants
174
+ VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
175
+ Element = rb_const_get(XML, rb_intern("Element"));
176
+ Text = rb_const_get(XML, rb_intern("Text"));
177
+ CDATA = rb_const_get(XML, rb_intern("CDATA"));
178
+ Comment = rb_const_get(XML, rb_intern("Comment"));
179
+
180
+ // interned symbols
181
+ new = rb_intern("new");
182
+ set_attribute = rb_intern("set_attribute");
183
+ add_child = rb_intern("add_child");
184
+ internal_subset = rb_intern("internal_subset");
185
+ remove_ = rb_intern("remove");
186
+ create_internal_subset = rb_intern("create_internal_subset");
187
+ #endif
188
+
189
+ // define Nokogumbo class with a singleton parse method
190
+ VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
191
+ rb_define_singleton_method(Gumbo, "parse", parse, 1);
192
+ }
@@ -23,10 +23,10 @@
23
23
 
24
24
  #include "util.h"
25
25
 
26
- struct _GumboParser;
26
+ struct GumboInternalParser;
27
27
 
28
28
  GumboAttribute* gumbo_get_attribute(
29
- const struct _GumboVector* attributes, const char* name) {
29
+ const GumboVector* attributes, const char* name) {
30
30
  for (int i = 0; i < attributes->length; ++i) {
31
31
  GumboAttribute* attr = attributes->data[i];
32
32
  if (!strcasecmp(attr->name, name)) {
@@ -37,7 +37,7 @@ GumboAttribute* gumbo_get_attribute(
37
37
  }
38
38
 
39
39
  void gumbo_destroy_attribute(
40
- struct _GumboParser* parser, GumboAttribute* attribute) {
40
+ struct GumboInternalParser* parser, GumboAttribute* attribute) {
41
41
  gumbo_parser_deallocate(parser, (void*) attribute->name);
42
42
  gumbo_parser_deallocate(parser, (void*) attribute->value);
43
43
  gumbo_parser_deallocate(parser, (void*) attribute);
@@ -23,12 +23,12 @@
23
23
  extern "C" {
24
24
  #endif
25
25
 
26
- struct _GumboParser;
26
+ struct GumboInternalParser;
27
27
 
28
28
  // Release the memory used for a GumboAttribute, including the attribute
29
29
  // itself.
30
30
  void gumbo_destroy_attribute(
31
- struct _GumboParser* parser, GumboAttribute* attribute);
31
+ struct GumboInternalParser* parser, GumboAttribute* attribute);
32
32
 
33
33
  #ifdef __cplusplus
34
34
  }
@@ -26,7 +26,7 @@
26
26
  #include "utf8.h"
27
27
  #include "util.h"
28
28
 
29
- struct _GumboParser;
29
+ struct GumboInternalParser;
30
30
 
31
31
  const int kGumboNoChar = -1;
32
32
 
@@ -2351,7 +2351,7 @@ static int parse_digit(int c, bool allow_hex) {
2351
2351
  }
2352
2352
 
2353
2353
  static void add_no_digit_error(
2354
- struct _GumboParser* parser, Utf8Iterator* input) {
2354
+ struct GumboInternalParser* parser, Utf8Iterator* input) {
2355
2355
  GumboError* error = gumbo_add_error(parser);
2356
2356
  if (!error) {
2357
2357
  return;
@@ -2361,7 +2361,7 @@ static void add_no_digit_error(
2361
2361
  }
2362
2362
 
2363
2363
  static void add_codepoint_error(
2364
- struct _GumboParser* parser, Utf8Iterator* input,
2364
+ struct GumboInternalParser* parser, Utf8Iterator* input,
2365
2365
  GumboErrorType type, int codepoint) {
2366
2366
  GumboError* error = gumbo_add_error(parser);
2367
2367
  if (!error) {
@@ -2373,7 +2373,7 @@ static void add_codepoint_error(
2373
2373
  }
2374
2374
 
2375
2375
  static void add_named_reference_error(
2376
- struct _GumboParser* parser, Utf8Iterator* input,
2376
+ struct GumboInternalParser* parser, Utf8Iterator* input,
2377
2377
  GumboErrorType type, GumboStringPiece text) {
2378
2378
  GumboError* error = gumbo_add_error(parser);
2379
2379
  if (!error) {
@@ -2394,7 +2394,7 @@ static int maybe_replace_codepoint(int codepoint) {
2394
2394
  }
2395
2395
 
2396
2396
  static bool consume_numeric_ref(
2397
- struct _GumboParser* parser, Utf8Iterator* input, int* output) {
2397
+ struct GumboInternalParser* parser, Utf8Iterator* input, int* output) {
2398
2398
  utf8iterator_next(input);
2399
2399
  bool is_hex = false;
2400
2400
  int c = utf8iterator_current(input);
@@ -2475,7 +2475,7 @@ static bool is_legal_attribute_char_next(Utf8Iterator* input) {
2475
2475
  }
2476
2476
 
2477
2477
  static bool maybe_add_invalid_named_reference(
2478
- struct _GumboParser* parser, Utf8Iterator* input) {
2478
+ struct GumboInternalParser* parser, Utf8Iterator* input) {
2479
2479
  // The iterator will always be reset in this code path, so we don't need to
2480
2480
  // worry about consuming characters.
2481
2481
  const char* start = utf8iterator_get_char_pointer(input);
@@ -2498,7 +2498,7 @@ static bool maybe_add_invalid_named_reference(
2498
2498
  }
2499
2499
 
2500
2500
  static bool consume_named_ref(
2501
- struct _GumboParser* parser, Utf8Iterator* input, bool is_in_attribute,
2501
+ struct GumboInternalParser* parser, Utf8Iterator* input, bool is_in_attribute,
2502
2502
  OneOrTwoCodepoints* output) {
2503
2503
  assert(output->first == kGumboNoChar);
2504
2504
  const NamedCharRef* char_ref = find_named_char_ref(input);
@@ -2530,7 +2530,7 @@ static bool consume_named_ref(
2530
2530
  }
2531
2531
 
2532
2532
  bool consume_char_ref(
2533
- struct _GumboParser* parser, struct _Utf8Iterator* input,
2533
+ struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
2534
2534
  int additional_allowed_char, bool is_in_attribute,
2535
2535
  OneOrTwoCodepoints* output) {
2536
2536
  utf8iterator_mark(input);
@@ -27,8 +27,8 @@
27
27
  extern "C" {
28
28
  #endif
29
29
 
30
- struct _GumboParser;
31
- struct _Utf8Iterator;
30
+ struct GumboInternalParser;
31
+ struct GumboInternalUtf8Iterator;
32
32
 
33
33
  // Value that indicates no character was produced.
34
34
  extern const int kGumboNoChar;
@@ -50,7 +50,7 @@ typedef struct {
50
50
  // space for the "additional allowed char" when the spec says "with no
51
51
  // additional allowed char". Returns false on parse error, true otherwise.
52
52
  bool consume_char_ref(
53
- struct _GumboParser* parser, struct _Utf8Iterator* input,
53
+ struct GumboInternalParser* parser, struct GumboInternalUtf8Iterator* input,
54
54
  int additional_allowed_char, bool is_in_attribute,
55
55
  OneOrTwoCodepoints* output);
56
56
 
@@ -30,7 +30,7 @@
30
30
  extern "C" {
31
31
  #endif
32
32
 
33
- struct _GumboParser;
33
+ struct GumboInternalParser;
34
34
 
35
35
  typedef enum {
36
36
  GUMBO_ERR_UTF8_INVALID,
@@ -78,7 +78,7 @@ typedef enum {
78
78
  } GumboErrorType;
79
79
 
80
80
  // Additional data for duplicated attributes.
81
- typedef struct _GumboDuplicateAttrError {
81
+ typedef struct GumboInternalDuplicateAttrError {
82
82
  // The name of the attribute. Owned by this struct.
83
83
  const char* name;
84
84
 
@@ -114,7 +114,7 @@ typedef enum {
114
114
  // Additional data for tokenizer errors.
115
115
  // This records the current state and codepoint encountered - this is usually
116
116
  // enough to reconstruct what went wrong and provide a friendly error message.
117
- typedef struct _GumboTokenizerError {
117
+ typedef struct GumboInternalTokenizerError {
118
118
  // The bad codepoint encountered.
119
119
  int codepoint;
120
120
 
@@ -123,7 +123,7 @@ typedef struct _GumboTokenizerError {
123
123
  } GumboTokenizerError;
124
124
 
125
125
  // Additional data for parse errors.
126
- typedef struct _GumboParserError {
126
+ typedef struct GumboInternalParserError {
127
127
  // The type of input token that resulted in this error.
128
128
  GumboTokenType input_type;
129
129
 
@@ -142,7 +142,7 @@ typedef struct _GumboParserError {
142
142
  // The overall error struct representing an error in decoding/tokenizing/parsing
143
143
  // the HTML. This contains an enumerated type flag, a source position, and then
144
144
  // a union of fields containing data specific to the error.
145
- typedef struct _GumboError {
145
+ typedef struct GumboInternalError {
146
146
  // The type of error.
147
147
  GumboErrorType type;
148
148
 
@@ -176,23 +176,23 @@ typedef struct _GumboError {
176
176
 
177
177
  // Parser state, for GUMBO_ERR_PARSER and
178
178
  // GUMBO_ERR_UNACKNOWLEDGE_SELF_CLOSING_TAG.
179
- struct _GumboParserError parser;
179
+ struct GumboInternalParserError parser;
180
180
  } v;
181
181
  } GumboError;
182
182
 
183
183
  // Adds a new error to the parser's error list, and returns a pointer to it so
184
184
  // that clients can fill out the rest of its fields. May return NULL if we're
185
185
  // already over the max_errors field specified in GumboOptions.
186
- GumboError* gumbo_add_error(struct _GumboParser* parser);
186
+ GumboError* gumbo_add_error(struct GumboInternalParser* parser);
187
187
 
188
188
  // Initializes the errors vector in the parser.
189
- void gumbo_init_errors(struct _GumboParser* errors);
189
+ void gumbo_init_errors(struct GumboInternalParser* errors);
190
190
 
191
191
  // Frees all the errors in the 'errors_' field of the parser.
192
- void gumbo_destroy_errors(struct _GumboParser* errors);
192
+ void gumbo_destroy_errors(struct GumboInternalParser* errors);
193
193
 
194
194
  // Frees the memory used for a single GumboError.
195
- void gumbo_error_destroy(struct _GumboParser* parser, GumboError* error);
195
+ void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
196
196
 
197
197
  // Prints an error to a string. This fills an empty GumboStringBuffer with a
198
198
  // freshly-allocated buffer containing the error message text. The caller is
@@ -200,7 +200,7 @@ void gumbo_error_destroy(struct _GumboParser* parser, GumboError* error);
200
200
  // the allocator specified in the GumboParser config and hence should be freed
201
201
  // by gumbo_parser_deallocate().)
202
202
  void gumbo_error_to_string(
203
- struct _GumboParser* parser, const GumboError* error,
203
+ struct GumboInternalParser* parser, const GumboError* error,
204
204
  GumboStringBuffer* output);
205
205
 
206
206
  // Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
@@ -209,13 +209,13 @@ void gumbo_error_to_string(
209
209
  // allocated with the allocator specified in the GumboParser config and hence
210
210
  // should be freed by gumbo_parser_deallocate().)
211
211
  void gumbo_caret_diagnostic_to_string(
212
- struct _GumboParser* parser, const GumboError* error,
212
+ struct GumboInternalParser* parser, const GumboError* error,
213
213
  const char* source_text, GumboStringBuffer* output);
214
214
 
215
215
  // Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
216
216
  // of writing to a string.
217
217
  void gumbo_print_caret_diagnostic(
218
- struct _GumboParser* parser, const GumboError* error,
218
+ struct GumboInternalParser* parser, const GumboError* error,
219
219
  const char* source_text);
220
220
 
221
221
  #ifdef __cplusplus
@@ -59,7 +59,7 @@ extern "C" {
59
59
  * buffer of bytes), while the column field is often used to reference a
60
60
  * particular column on a printable display, which nowadays is usually UTF-8.
61
61
  */
62
- typedef struct _GumboSourcePosition {
62
+ typedef struct {
63
63
  unsigned int line;
64
64
  unsigned int column;
65
65
  unsigned int offset;
@@ -81,7 +81,7 @@ extern const GumboSourcePosition kGumboEmptySourcePosition;
81
81
  * Clients should assume that it is not NUL-terminated, and should always use
82
82
  * explicit lengths when manipulating them.
83
83
  */
84
- typedef struct _GumboStringPiece {
84
+ typedef struct {
85
85
  /** A pointer to the beginning of the string. NULL iff length == 0. */
86
86
  const char* data;
87
87
 
@@ -116,7 +116,7 @@ bool gumbo_string_equals_ignore_case(
116
116
  * library. Iteration can be done through inspecting the structure directly in
117
117
  * a for-loop.
118
118
  */
119
- typedef struct _GumboVector {
119
+ typedef struct {
120
120
  /** Data elements. This points to a dynamically-allocated array of capacity
121
121
  * elements, each a void* to the element itself.
122
122
  */
@@ -151,7 +151,7 @@ int gumbo_vector_index_of(GumboVector* vector, void* element);
151
151
  * efficiency benefits, by letting the parser work with enums instead of
152
152
  * strings.
153
153
  */
154
- typedef enum _GumboTag {
154
+ typedef enum {
155
155
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#the-root-element
156
156
  GUMBO_TAG_HTML,
157
157
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/semantics.html#document-metadata
@@ -365,7 +365,7 @@ GumboTag gumbo_tag_enum(const char* tagname);
365
365
  * HTML includes special handling for XLink, XML, and XMLNS namespaces on
366
366
  * attributes. Everything else goes in the generic "NONE" namespace.
367
367
  */
368
- typedef enum _GumboAttributeNamespaceEnum {
368
+ typedef enum {
369
369
  GUMBO_ATTR_NAMESPACE_NONE,
370
370
  GUMBO_ATTR_NAMESPACE_XLINK,
371
371
  GUMBO_ATTR_NAMESPACE_XML,
@@ -377,7 +377,7 @@ typedef enum _GumboAttributeNamespaceEnum {
377
377
  * name-value pair, but also includes information about source locations and
378
378
  * original source text.
379
379
  */
380
- typedef struct _GumboAttribute {
380
+ typedef struct {
381
381
  /**
382
382
  * The namespace for the attribute. This will usually be
383
383
  * GUMBO_ATTR_NAMESPACE_NONE, but some XLink/XMLNS/XML attributes take special
@@ -438,14 +438,13 @@ typedef struct _GumboAttribute {
438
438
  * and return it, or NULL if no such attribute exists. This uses a
439
439
  * case-insensitive match, as HTML is case-insensitive.
440
440
  */
441
- GumboAttribute* gumbo_get_attribute(
442
- const struct _GumboVector* attrs, const char* name);
441
+ GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
443
442
 
444
443
  /**
445
444
  * Enum denoting the type of node. This determines the type of the node.v
446
445
  * union.
447
446
  */
448
- typedef enum _GumboNodeType {
447
+ typedef enum {
449
448
  /** Document node. v will be a GumboDocument. */
450
449
  GUMBO_NODE_DOCUMENT,
451
450
  /** Element node. v will be a GumboElement. */
@@ -464,10 +463,10 @@ typedef enum _GumboNodeType {
464
463
  * Forward declaration of GumboNode so it can be used recursively in
465
464
  * GumboNode.parent.
466
465
  */
467
- typedef struct _GumboNode GumboNode;
466
+ typedef struct GumboInternalNode GumboNode;
468
467
 
469
468
  /** http://www.whatwg.org/specs/web-apps/current-work/complete/dom.html#quirks-mode */
470
- typedef enum _GumboQuirksModeEnum {
469
+ typedef enum {
471
470
  GUMBO_DOCTYPE_NO_QUIRKS,
472
471
  GUMBO_DOCTYPE_QUIRKS,
473
472
  GUMBO_DOCTYPE_LIMITED_QUIRKS
@@ -480,7 +479,7 @@ typedef enum _GumboQuirksModeEnum {
480
479
  * <math> tag is in the MathML namespace, and anything else is inside the HTML
481
480
  * namespace. No other namespaces are supported, so this can be an enum only.
482
481
  */
483
- typedef enum _GumboNamespaceEnum {
482
+ typedef enum {
484
483
  GUMBO_NAMESPACE_HTML,
485
484
  GUMBO_NAMESPACE_SVG,
486
485
  GUMBO_NAMESPACE_MATHML
@@ -494,7 +493,7 @@ typedef enum _GumboNamespaceEnum {
494
493
  * may not be allowed by a style guide, or track the prevalence of incorrect or
495
494
  * tricky HTML code.
496
495
  */
497
- typedef enum _GumboParseFlags {
496
+ typedef enum {
498
497
  /**
499
498
  * A normal node - both start and end tags appear in the source, nothing has
500
499
  * been reparented.
@@ -568,7 +567,7 @@ typedef enum _GumboParseFlags {
568
567
  /**
569
568
  * Information specific to document nodes.
570
569
  */
571
- typedef struct _GumboDocument {
570
+ typedef struct {
572
571
  /**
573
572
  * An array of GumboNodes, containing the children of this element. This will
574
573
  * normally consist of the <html> element and any comment nodes found.
@@ -595,7 +594,7 @@ typedef struct _GumboDocument {
595
594
  * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE elements.
596
595
  * This contains just a block of text and its position.
597
596
  */
598
- typedef struct _GumboText {
597
+ typedef struct {
599
598
  /**
600
599
  * The text of this node, after entities have been parsed and decoded. For
601
600
  * comment/cdata nodes, this does not include the comment delimiters.
@@ -619,7 +618,7 @@ typedef struct _GumboText {
619
618
  * The struct used to represent all HTML elements. This contains information
620
619
  * about the tag, attributes, and child nodes.
621
620
  */
622
- typedef struct _GumboElement {
621
+ typedef struct {
623
622
  /**
624
623
  * An array of GumboNodes, containing the children of this element. Pointers
625
624
  * are owned.
@@ -664,7 +663,7 @@ typedef struct _GumboElement {
664
663
  * A supertype for GumboElement and GumboText, so that we can include one
665
664
  * generic type in lists of children and cast as necessary to subtypes.
666
665
  */
667
- struct _GumboNode {
666
+ struct GumboInternalNode {
668
667
  /** The type of node that this is. */
669
668
  GumboNodeType type;
670
669
 
@@ -710,7 +709,7 @@ typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
710
709
  * handling, etc.
711
710
  * Use kGumboDefaultOptions for sensible defaults, and only set what you need.
712
711
  */
713
- typedef struct _GumboOptions {
712
+ typedef struct GumboInternalOptions {
714
713
  /** A memory allocator function. Default: malloc. */
715
714
  GumboAllocatorFunction allocator;
716
715
 
@@ -749,7 +748,7 @@ typedef struct _GumboOptions {
749
748
  extern const GumboOptions kGumboDefaultOptions;
750
749
 
751
750
  /** The output struct containing the results of the parse. */
752
- typedef struct _GumboOutput {
751
+ typedef struct GumboInternalOutput {
753
752
  /**
754
753
  * Pointer to the document node. This is a GumboNode of type NODE_DOCUMENT
755
754
  * that contains the entire document as its child.
@@ -779,18 +778,18 @@ typedef struct _GumboOutput {
779
778
  *
780
779
  * This doesn't support buffers longer than 4 gigabytes.
781
780
  */
782
- struct _GumboOutput* gumbo_parse(const char* buffer);
781
+ GumboOutput* gumbo_parse(const char* buffer);
783
782
 
784
783
  /**
785
784
  * Extended version of gumbo_parse that takes an explicit options structure,
786
785
  * buffer, and length.
787
786
  */
788
- struct _GumboOutput* gumbo_parse_with_options(
787
+ GumboOutput* gumbo_parse_with_options(
789
788
  const GumboOptions* options, const char* buffer, size_t buffer_length);
790
789
 
791
790
  /** Release the memory used for the parse tree & parse errors. */
792
791
  void gumbo_destroy_output(
793
- const struct _GumboOptions* options, GumboOutput* output);
792
+ const GumboOptions* options, GumboOutput* output);
794
793
 
795
794
 
796
795
  #ifdef __cplusplus