RubyGems - nokogumbo - Versions diffs - 1.5.0 → 2.0.0.pre.alpha - Mend

nokogumbo 1.5.0 → 2.0.0.pre.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +56 -0
data/README.md +146 -22
data/ext/nokogumbo/extconf.rb +116 -0
data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
data/gumbo-parser/src/ascii.c +33 -0
data/gumbo-parser/src/ascii.h +31 -0
data/gumbo-parser/src/attribute.c +26 -28
data/gumbo-parser/src/attribute.h +3 -23
data/gumbo-parser/src/char_ref.c +135 -2351
data/gumbo-parser/src/char_ref.h +13 -29
data/gumbo-parser/src/error.c +215 -133
data/gumbo-parser/src/error.h +34 -49
data/gumbo-parser/src/foreign_attrs.c +104 -0
data/gumbo-parser/src/gumbo.h +506 -304
data/gumbo-parser/src/insertion_mode.h +4 -28
data/gumbo-parser/src/macros.h +91 -0
data/gumbo-parser/src/parser.c +1989 -1431
data/gumbo-parser/src/parser.h +6 -22
data/gumbo-parser/src/replacement.h +33 -0
data/gumbo-parser/src/string_buffer.c +43 -50
data/gumbo-parser/src/string_buffer.h +24 -40
data/gumbo-parser/src/string_piece.c +39 -39
data/gumbo-parser/src/svg_attrs.c +174 -0
data/gumbo-parser/src/svg_tags.c +137 -0
data/gumbo-parser/src/tag.c +186 -59
data/gumbo-parser/src/tag_lookup.c +382 -0
data/gumbo-parser/src/tag_lookup.h +13 -0
data/gumbo-parser/src/token_type.h +1 -25
data/gumbo-parser/src/tokenizer.c +899 -495
data/gumbo-parser/src/tokenizer.h +37 -37
data/gumbo-parser/src/tokenizer_states.h +6 -22
data/gumbo-parser/src/utf8.c +103 -86
data/gumbo-parser/src/utf8.h +37 -41
data/gumbo-parser/src/util.c +48 -38
data/gumbo-parser/src/util.h +10 -40
data/gumbo-parser/src/vector.c +45 -57
data/gumbo-parser/src/vector.h +17 -39
data/lib/nokogumbo.rb +10 -174
data/lib/nokogumbo/html5.rb +250 -0
data/lib/nokogumbo/html5/document.rb +37 -0
data/lib/nokogumbo/html5/document_fragment.rb +46 -0
data/lib/nokogumbo/version.rb +3 -0
data/lib/nokogumbo/xml/node.rb +57 -0
metadata +32 -19
data/ext/nokogumboc/extconf.rb +0 -60
data/gumbo-parser/src/char_ref.rl +0 -2554
data/gumbo-parser/src/string_piece.h +0 -38
data/gumbo-parser/src/tag.in +0 -150
data/gumbo-parser/src/tag_enum.h +0 -153
data/gumbo-parser/src/tag_gperf.h +0 -105
data/gumbo-parser/src/tag_sizes.h +0 -4
data/gumbo-parser/src/tag_strings.h +0 -153
data/gumbo-parser/visualc/include/strings.h +0 -4
data/test-nokogumbo.rb +0 -190

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 96fa61565f78d5491e0b6d5b505cf936524745eb848b8b6584fc15e20c7ae35b
-  data.tar.gz: e5416f71bbe90323f04b8aad4dc48b28947e43a9eb46f446f8ca1444f519a07b
+  metadata.gz: e0d434c0749d7922ba8f084c15ed7219ccbf0e07b715368ae846bc38e64aad17
+  data.tar.gz: 2770648e3e9e82d0ffb1877f1c06edc537688cf6a8405bc52dbdf5a6bb69bc1a
 SHA512:
-  metadata.gz: 676bf3585d38cd4ad5c72b8b3afd4952e248c747683ae1072dd43f6ce1ccd279177e4d0c75a9821ed76d32806333128152231349d8d113ae5d81279580b13004
-  data.tar.gz: 3459078d96977399e75551c4a3ee5623091f48569984b771e540ec111125f5af91e39a8d78cbd3ce9280326b1b9395dc4a0b0d7f0a72294876682cb9fe35e3d9
+  metadata.gz: e6c3de49495bf55ccaa250e2a3275b6796b0f0565da2a930e3333d2a153f2a16312eb77cb28ca3e03c17720127c2ecc27a1f71cfd6acfd15407295c29973e9fb
+  data.tar.gz: e8ce6c80cb2327d2327f03c7e829156c1f0074ba4d6fce2b0d59305b80112b8fd5edc0932fad1fca13cb5f4bb6f2652fe52a2f090110aa76d06e1afbdebc334f

data/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,56 @@
+# Changelog
+All notable changes to Nokogumbo will be documented in this file.
+The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
+and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+## [Unreleased]
+### Added
+- Experimental support for errors (it was supported in 1.5.0 but
+  undocumented).
+- Added proper HTML5 serialization.
+- Added option `:max_tree_depth` to control the maximum parse tree depth.
+### Changed
+- Integrated [Gumbo parser](https://github.com/google/gumbo-parser) into
+  Nokogumbo. A system version will not be used.
+- The undocumented (but publicly mentioned) `:max_parse_errors` was renamed to `:max_errors`;
+  `:max_parse_errors` is deprecated and will go away.
+- The various `#parse` and `#fragment` (and `Nokogiri.HTML5`) methods return
+  `Nokogiri::HTML5::Document` and `Nokogiri::HTML5::DocumentFragment` classes
+  rather than `Nokogiri::HTML::Document` and
+  `Nokogiri::HTML::DocumentFragment`.
+- Changed the top-level API to more closely match Nokogiri's while maintaining
+  backwards compatibility. The new APIs are
+  * `Nokogiri::HTML5(html, url = nil, encoding = nil, **options, &block)`
+  * `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, **options, &block)`
+  * `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, **options, &block)`
+  * `Nokogiri::HTML5.fragment(html, encoding = nil, **options)`
+  * `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **options)`
+  In all cases, `html` can be a string or an `IO` object (something that
+  responds to `#read`). The `url` parameter is entirely for error reporting,
+  as in Nokogiri. The `encoding` parameter only signals what encoding `html`
+  should have on input; the output `Document` or `DocumentFragment` will be in
+  UTF-8. Currently, the only option supported is `:max_errors` which controls
+  the maximum number of errors reported by `#errors`.
+### Deprecated
+- `:max_parse_errors`; use `:max_errors`
+### Removed
+### Fixed
+- Fixed documents failing to serialize (via `to_html`) if they contain certain
+  `meta` elements that set the `charset`.
+- Documents are now properly marked as UTF-8 after parsing.
+- Fixed `Nokogiri::HTML5.fragment` reporting an error due to a missing
+  `<!DOCTYPE html>`.
+- Fixed crash when input contains U+0000 NULL bytes and error reporting is
+  enabled.
+### Security
+- The most recent, released version of Gumbo has a [potential security
+  issue](https://github.com/google/gumbo-parser/pull/375) that could result in
+  a cross-site scripting vulnerability. This has been fixed by integrating
+  Gumbo into Nokogumbo.

data/README.md CHANGED Viewed

@@ -1,5 +1,4 @@
-Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
-===========
+# Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
 Nokogumbo provides the ability for a Ruby program to invoke the
 [Gumbo HTML5 parser](https://github.com/google/gumbo-parser#readme)
@@ -8,12 +7,11 @@ and to access the result as a
 [![Build Status](https://travis-ci.org/rubys/nokogumbo.svg)](https://travis-ci.org/rubys/nokogumbo)
-Usage
------
+## Usage
 ```ruby
 require 'nokogumbo'
-doc = Nokogiri::HTML5(string)
+doc = Nokogiri.HTML5(string)
 ```
 An experimental _fragment_ method is also provided.  While not HTML5
@@ -32,21 +30,150 @@ require 'nokogumbo'
 doc = Nokogiri::HTML5.get(uri)
 ```
-Example
------
+## Parsing options
+The document and fragment parsing methods,
+- `Nokogiri.HTML5(html, url = nil, encoding = nil, options = {})`
+- `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {})`
+- `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {})`
+- `Nokogiri::HTML5.fragment(html, encoding = nil, options = {})`
+- `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})`
+support options that are different from Nokogiri's.
+The two currently supported options are `:max_errors` and `:max_tree_depth`,
+described below.
+### Error reporting
+Nokogumbo contains an experimental parse error reporting facility. By default,
+no parse errors are reported but this can be configured by passing the
+`:max_errors` option to `::parse` or `::fragment`.
 ```ruby
 require 'nokogumbo'
-puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
+doc = Nokogiri::HTML5.parse('Hi there!<body>', max_errors: 10)
+doc.errors.each do |err|
+  puts err
+end
+```
+This prints the following.
+```
+1:1: ERROR: @1:1: The doctype must be the first token in the document.
+Hi there!<body>
+^
+1:10: ERROR: @1:10: That tag isn't allowed here  Currently open tags: html, body..
+Hi there!<body>
+         ^
+```
+Using `max_errors: -1` results in an unlimited number of errors being
+returned.
+The errors returned by `#errors` are instances of
+[`Nokogiri::XML::SyntaxError`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError).
+### Maximum tree depth
+The maximum depth of the DOM tree parsed by the various parsing methods is
+configurable by the `:max_tree_depth` option. If the depth of the tree would
+exceed this limit, then an
+[ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is thrown.
+This limit (which defaults to `Nokogumbo::DEFAULT_MAX_TREE_DEPTH = 400`) can
+be removed by giving the option `max_tree_depth: -1`.
+``` ruby
+html = '<!DOCTYPE html>' + '<div>' * 1000
+doc = Nokogiri.HTML5(html)
+# raises ArgumentError: Document tree depth limit exceeded
+doc = Nokogiri.HTML5(html, max_tree_depth: -1)
 ```
-Use `.to_html` instead of `.to_s` when parsing and serializing multiple times
+## HTML Serialization
+After parsing HTML, it may be serialized using any of the Nokogiri
+[serialization
+methods](https://www.rubydoc.info/gems/nokogiri/Nokogiri/XML/Node). In
+particular, `#serialize`, `#to_html`, and `#to_s` will serialize a given node
+and its children. (This is the equivalent of JavaScript's
+`Element.outerHTML`.) Similarly, `#inner_html` will serialize the children of
+a given node. (This is the equivalent of JavaScript's `Element.innerHTML`.)
+``` ruby
+doc = Nokogiri::HTML5("<!DOCTYPE html><span>Hello world!</span>")
+puts doc.serialize
+# Prints: <!DOCTYPE html><html><head></head><body><span>Hello world!</span></body></html>
+```
+Due to quirks in how HTML is parsed and serialized, it's possible for a DOM
+tree to be serialized and then re-parsed, resulting in a different DOM.
+Mostly, this happens with DOMs produced from invalid HTML. Unfortunately, even
+valid HTML may not survive serialization and re-parsing.
+In particular, a newline at the start of `pre`, `listing`, and `textarea`
+elements is ignored by the parser.
+``` ruby
+doc = Nokogiri::HTML5(<<-EOF)
+<!DOCTYPE html>
+<pre>
+Content</pre>
+EOF
+puts doc.at('/html/body/pre').serialize
+# Prints: <pre>Content</pre>
 ```
+In this case, the original HTML is semantically equivalent to the serialized
+version. If the `pre`, `listing`, or `textarea` content starts with two
+newlines, the first newline will be stripped on the first parse and the second
+newline will be stripped on the second, leading to semantically different
+DOMs. Passing the parameter `preserve_newline: true` will cause two or more
+newlines to be preserved. (A single leading newline will still be removed.)
+``` ruby
+doc = Nokogiri::HTML5(<<-EOF)
+<!DOCTYPE html>
+<listing>
+Content</listing>
+EOF
+puts doc.at('/html/body/listing').serialize(preserve_newline: true)
+# Prints: <listing>
+#
+# Content</listing>
+```
+## Encodings
+Nokogumbo always parses HTML using
+[UTF-8](https://en.wikipedia.org/wiki/UTF-8); however, the encoding of the
+input can be explicitly selected via the optional `encoding` parameter. This
+is most useful when the input comes not from a string but from an IO object.
+When serializing a document or node, the encoding of the output string can be
+specified via the `:encoding` option. Characters that cannot be encoded in
+the selected encoding will be encoded as [HTML numeric
+entities](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references).
+``` ruby
+frag = Nokogiri::HTML5.fragment('<span>아는 길도 물어가라</span>')
+html = frag.serialize(encoding: 'US-ASCII')
+puts html
+# Prints: <span>&#xc544;&#xb294; &#xae38;&#xb3c4; &#xbb3c;&#xc5b4;&#xac00;&#xb77c;</span>
+frag = Nokogiri::HTML5.fragment(html)
+puts frag.serialize
+# Prints: <span>아는 길도 물어가라</span>
+```
+(There's a [bug](https://bugs.ruby-lang.org/issues/15033) in all current
+versions of Ruby that can cause the entity encoding to fail. Of the mandated
+supported encodings for HTML, the only encoding I'm aware of that has this bug
+is `'ISO-2022-JP'`. I recommend avoiding this encoding.)
+## Examples
+```ruby
 require 'nokogumbo'
-Nokogiri::HTML5.parse(Nokogiri::HTML5.parse('<div></div> a').to_html).to_html
+puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
 ```
-Notes
------
+## Notes
 * The `Nokogiri::HTML5.fragment` function takes a string and parses it
   as an HTML5 document.  The `<html>`, `<head>`, and `<body>` elements are
@@ -74,20 +201,17 @@ rules defined in the HTML5 specification for doing so.
 * Instead of returning `unknown` as the element name for unknown tags, the
 original tag name is returned verbatim.
-* If the Gumbo HTML5 parser is not already installed, the source for the
-parser will be downloaded and compiled into the Gem itself.
-Installation
-============
+# Installation
-    git clone --recursive https://github.com/rubys/nokogumbo.git
+    git clone https://github.com/rubys/nokogumbo.git
     cd nokogumbo
     bundle install
     rake gem
     gem install pkg/nokogumbo*.gem
-Related efforts
-============
+# Related efforts
-* [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) - a ruby binding
-for the Gumbo HTML5 parser.
+* [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) -- a ruby binding
+  for the Gumbo HTML5 parser.
+* [lua-gumbo](https://gitlab.com/craigbarnes/lua-gumbo) -- a lua binding for
+  the Gumbo HTML5 parser.

data/ext/nokogumbo/extconf.rb ADDED Viewed

@@ -0,0 +1,116 @@
+require 'fileutils'
+require 'mkmf'
+require 'nokogiri'
+$CFLAGS += " -std=c99"
+$LDFLAGS.gsub!('-Wl,--no-undefined', '')
+$warnflags = CONFIG['warnflags'] = '-Wall'
+NG_SPEC = Gem::Specification.find_by_name('nokogiri', "= #{Nokogiri::VERSION}")
+def download_headers
+  begin
+    require 'yaml'
+    dependencies = YAML.load_file(File.join(NG_SPEC.gem_dir, 'dependencies.yml'))
+    version = dependencies['libxml2']['version']
+    host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
+    path = File.join('ports', host, 'libxml2', version, 'include/libxml2')
+    return path if File.directory?(path)
+    # Make sure we're using the same version Nokogiri uses
+    dep_index = NG_SPEC.dependencies.index { |dep| dep.name == 'mini_portile2' and dep.type == :runtime }
+    return nil if dep_index.nil?
+    requirement = NG_SPEC.dependencies[dep_index].requirement.to_s
+    require 'rubygems'
+    gem 'mini_portile2', requirement
+    require 'mini_portile2'
+    p = MiniPortile::new('libxml2', version).tap do |r|
+      r.host = RbConfig::CONFIG["host_alias"].empty? ? RbConfig::CONFIG["host"] : RbConfig::CONFIG["host_alias"]
+      r.files = [{
+        url: "http://xmlsoft.org/sources/libxml2-#{r.version}.tar.gz",
+        sha256: dependencies['libxml2']['sha256']
+      }]
+      r.configure_options += [
+        "--without-python",
+        "--without-readline",
+        "--with-c14n",
+        "--with-debug",
+        "--with-threads"
+      ]
+    end
+    p.download unless p.downloaded?
+    p.extract
+    p.configure unless p.configured?
+    system('make', '-C', "tmp/#{p.host}/ports/libxml2/#{version}/libxml2-#{version}/include/libxml", 'install-xmlincHEADERS')
+    path
+  rescue
+    puts 'failed to download/install headers'
+    nil
+  end
+end
+required = arg_config('--with-libxml2')
+prohibited = arg_config('--without-libxml2')
+if required and prohibited
+  abort "cannot use both --with-libxml2 and --without-libxml2"
+end
+have_libxml2 = false
+have_ng = false
+if !prohibited
+  if Nokogiri::VERSION_INFO.include?('libxml') and
+     Nokogiri::VERSION_INFO['libxml']['source'] == 'packaged'
+    # Nokogiri has libxml2 built in. Find the headers.
+    libxml2_path = File.join(Nokogiri::VERSION_INFO['libxml']['libxml2_path'],
+                             'include/libxml2')
+    if find_header('libxml/tree.h', libxml2_path)
+      have_libxml2 = true
+    else
+      # Unfortunately, some versions of Nokogiri delete these files.
+      # https://github.com/sparklemotion/nokogiri/pull/1788
+      # Try to download them
+      libxml2_path = download_headers
+      unless libxml2_path.nil?
+        have_libxml2 = find_header('libxml/tree.h', libxml2_path)
+      end
+    end
+  else
+    # Nokogiri is compiled with system headers.
+    # Hack to work around broken mkmf on macOS
+    # (https://bugs.ruby-lang.org/issues/14992 fixed now)
+    if RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] == 'DYLD_LIBRARY_PATH'
+      RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] = 'DYLD_FALLBACK_LIBRARY_PATH'
+    end
+    pkg_config('libxml-2.0')
+    have_libxml2 = have_library('xml2', 'xmlNewDoc')
+  end
+  if required and !have_libxml2
+    abort "libxml2 required but could not be located"
+  end
+  if have_libxml2
+    # Find nokogiri.h
+    have_ng = find_header('nokogiri.h', File.join(NG_SPEC.gem_dir, 'ext/nokogiri'))
+  end
+end
+if have_libxml2 and have_ng
+  $CFLAGS += " -DNGLIB=1"
+end
+# Compile the gumbo-parser source files in place (located via $VPATH).
+ext_dir = File.dirname(__FILE__)
+gumbo_src = File.join(ext_dir, 'gumbo_src')
+Dir.chdir(ext_dir) do
+  $srcs = Dir['*.c', '../../gumbo-parser/src/*.c']
+end
+$INCFLAGS << ' -I$(srcdir)/../../gumbo-parser/src'
+$VPATH << '$(srcdir)/../../gumbo-parser/src'
+create_makefile('nokogumbo/nokogumbo')
+# vim: set sw=2 sts=2 ts=8 et:

data/ext/{nokogumboc → nokogumbo}/nokogumbo.c RENAMED Viewed

@@ -2,7 +2,7 @@
 // nokogumbo.c defines the following:
 //
 //   class Nokogumbo
-//     def parse(utf8_string) # returns Nokogiri::HTML::Document
+//     def parse(utf8_string) # returns Nokogiri::HTML5::Document
 //   end
 //
 // Processing starts by calling gumbo_parse_with_options.  The resulting
@@ -18,26 +18,29 @@
 //    methods are called instead, producing the equivalent functionality.
 //
+#include <assert.h>
 #include <ruby.h>
 #include "gumbo.h"
 #include "error.h"
-#include "parser.h"
 // class constants
 static VALUE Document;
-static VALUE XMLSyntaxError;
 #ifdef NGLIB
 #include <nokogiri.h>
+#include <xml_syntax_error.h>
 #include <libxml/tree.h>
+#include <libxml/HTMLtree.h>
 #define NIL NULL
 #define CONST_CAST (xmlChar const*)
 #else
-#define NIL 0
+#define NIL Qnil
 #define CONST_CAST
 // more class constants
+static VALUE cNokogiriXmlSyntaxError;
 static VALUE Element;
 static VALUE Text;
 static VALUE CDATA;
@@ -45,11 +48,15 @@ static VALUE Comment;
 // interned symbols
 static VALUE new;
+static VALUE attribute;
 static VALUE set_attribute;
+static VALUE remove_attribute;
 static VALUE add_child;
 static VALUE internal_subset;
 static VALUE remove_;
 static VALUE create_internal_subset;
+static VALUE key_;
+static VALUE node_name_;
 // map libxml2 types to Ruby VALUE
 #define xmlNodePtr VALUE
@@ -58,12 +65,10 @@ static VALUE create_internal_subset;
 // redefine libxml2 API as Ruby function calls
 #define xmlNewDocNode(doc, ns, name, content) \
   rb_funcall(Element, new, 2, rb_str_new2(name), doc)
-#define xmlNewProp(element, name, value) \
-  rb_funcall(element, set_attribute, 2, rb_str_new2(name), rb_str_new2(value))
 #define xmlNewDocText(doc, text) \
   rb_funcall(Text, new, 2, rb_str_new2(text), doc)
 #define xmlNewCDataBlock(doc, content, length) \
-  rb_funcall(CDATA, new, 2, rb_str_new(content, length), doc)
+  rb_funcall(CDATA, new, 2, doc, rb_str_new(content, length))
 #define xmlNewDocComment(doc, text) \
   rb_funcall(Comment, new, 2, doc, rb_str_new2(text))
 #define xmlAddChild(element, node) \
@@ -77,11 +82,76 @@ static VALUE create_internal_subset;
 #define Nokogiri_wrap_xml_document(klass, doc) \
   doc
-// remove internal subset from newly created documents
-static VALUE xmlNewDoc(char* version) {
-  VALUE doc = rb_funcall(Document, new, 0);
-  rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0);
-  return doc;
+static VALUE find_dummy_key(VALUE collection) {
+  VALUE r_dummy = Qnil;
+  char dummy[5] = "a";
+  size_t len = 1;
+  while (len < sizeof dummy) {
+    r_dummy = rb_str_new(dummy, len);
+    if (rb_funcall(collection, key_, 1, r_dummy) == Qfalse)
+      return r_dummy;
+    for (size_t i = 0; ; ++i) {
+      if (dummy[i] == 0) {
+        dummy[i] = 'a';
+        ++len;
+        break;
+      }
+      if (dummy[i] == 'z')
+        dummy[i] = 'a';
+      else {
+        ++dummy[i];
+        break;
+      }
+    }
+  }
+  // This collection has 475254 elements?? Give up.
+  return Qnil;
+}
+static xmlNodePtr xmlNewProp(xmlNodePtr node, const char *name, const char *value) {
+  // Nokogiri::XML::Node#set_attribute calls xmlSetProp(node, name, value)
+  // which behaves roughly as
+  // if name is a QName prefix:local
+  //   if node->doc has a namespace ns corresponding to prefix
+  //     return xmlSetNsProp(node, ns, local, value)
+  // return xmlSetNsProp(node, NULL, name, value)
+  //
+  // If the prefix is "xml", then the namespace lookup will create it.
+  //
+  // By contrast, xmlNewProp does not do this parsing and creates an attribute
+  // with the name and value exactly as given. This is the behavior that we
+  // want.
+  //
+  // Thus, for attribute names like "xml:lang", #set_attribute will create an
+  // attribute with namespace "xml" and name "lang". This is incorrect for
+  // html elements (but correct for foreign elements).
+  //
+  // Work around this by inserting a dummy attribute and then changing the
+  // name, if needed.
+  // Can't use strchr since it's locale-sensitive.
+  size_t len = strlen(name);
+  VALUE r_name = rb_str_new(name, len);
+  if (memchr(name, ':', len) == NULL) {
+    // No colon.
+    return rb_funcall(node, set_attribute, 2, r_name, rb_str_new2(value));
+  }
+  // Find a dummy attribute string that doesn't already exist.
+  VALUE dummy = find_dummy_key(node);
+  if (dummy == Qnil)
+    return Qnil;
+  // Add the dummy attribute.
+  VALUE r_value = rb_funcall(node, set_attribute, 2, dummy, rb_str_new2(value));
+  if (r_value == Qnil)
+    return Qnil;
+  // Remove the old attribute, if it exists.
+  rb_funcall(node, remove_attribute, 1, r_name);
+  // Rename the dummy
+  VALUE attr = rb_funcall(node, attribute, 1, dummy);
+  if (attr == Qnil)
+    return Qnil;
+  rb_funcall(attr, node_name_, 1, r_name);
+  return attr;
 }
 #endif
@@ -90,30 +160,15 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node);
 // Build a xmlNodePtr for a given GumboElement (recursively)
 static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
-  // determine tag name for a given node
-  xmlNodePtr element;
-  if (node->tag != GUMBO_TAG_UNKNOWN) {
-    element = xmlNewDocNode(document, NIL,
-      CONST_CAST gumbo_normalized_tagname(node->tag), NIL);
-  } else {
-    GumboStringPiece tag = node->original_tag;
-    gumbo_tag_from_original_text(&tag);
-#ifdef _MSC_VER
-    char* name = alloca(tag.length+1);
-#else
-    char name[tag.length+1];
-#endif
-    strncpy(name, tag.data, tag.length);
-    name[tag.length] = '\0';
-    element = xmlNewDocNode(document, NIL, CONST_CAST name, NIL);
-  }
+  // create the given element
+  xmlNodePtr element = xmlNewDocNode(document, NIL, CONST_CAST node->name, NIL);
   // add in the attributes
   GumboVector* attrs = &node->attributes;
   char *name = NULL;
-  int namelen = 0;
-  char *ns;
-  for (int i=0; i < attrs->length; i++) {
+  size_t namelen = 0;
+  const char *ns;
+  for (size_t i=0; i < attrs->length; i++) {
     GumboAttribute *attr = attrs->data[i];
     switch (attr->attr_namespace) {
@@ -156,7 +211,7 @@ static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
   // add in the children
   GumboVector* children = &node->children;
-  for (int i=0; i < children->length; i++) {
+  for (size_t i=0; i < children->length; i++) {
     xmlNodePtr node = walk_tree(document, children->data[i]);
     if (node) xmlAddChild(element, node);
   }
@@ -176,37 +231,89 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
       return xmlNewDocText(document, CONST_CAST node->v.text.text);
     case GUMBO_NODE_CDATA:
       return xmlNewCDataBlock(document,
-        CONST_CAST node->v.text.original_text.data,
-        (int) node->v.text.original_text.length);
+        CONST_CAST node->v.text.text,
+        (int) strlen(node->v.text.text));
     case GUMBO_NODE_COMMENT:
       return xmlNewDocComment(document, CONST_CAST node->v.text.text);
   }
 }
+// URI = system id
+// external id = public id
+#if NGLIB
+static htmlDocPtr new_html_doc(const char *dtd_name, const char *system, const char *public)
+{
+  // These two libxml2 functions take the public and system ids in
+  // opposite orders.
+  htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
+  assert(doc);
+  if (dtd_name)
+    xmlCreateIntSubset(doc, CONST_CAST dtd_name, CONST_CAST public, CONST_CAST system);
+  return doc;
+}
+#else
+// remove internal subset from newly created documents
+static VALUE new_html_doc(const char *dtd_name, const char *system, const char *public) {
+  VALUE doc;
+  // If system and public are both NULL, Document#new is going to set default
+  // values for them so we're going to have to remove the internal subset
+  // which seems to leak memory in Nokogiri, so leak as little as possible.
+  if (system == NULL && public == NULL) {
+    doc = rb_funcall(Document, new, 2, /* URI */ Qnil, /* external_id */ rb_str_new("", 0));
+    rb_funcall(rb_funcall(doc, internal_subset, 0), remove_, 0);
+    if (dtd_name) {
+      // We need to create an internal subset now.
+      rb_funcall(doc, create_internal_subset, 3, rb_str_new2(dtd_name), Qnil, Qnil);
+    }
+  } else {
+    assert(dtd_name);
+    // Rather than removing and creating the internal subset as we did above,
+    // just create and then rename one.
+    VALUE r_system = system ? rb_str_new2(system) : Qnil;
+    VALUE r_public = public ? rb_str_new2(public) : Qnil;
+    doc = rb_funcall(Document, new, 2, r_system, r_public);
+    rb_funcall(rb_funcall(doc, internal_subset, 0), node_name_, 1, rb_str_new2(dtd_name));
+  }
+  return doc;
+}
+#endif
 // Parse a string using gumbo_parse into a Nokogiri document
-static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
-  GumboOptions options;
-  memcpy(&options, &kGumboDefaultOptions, sizeof options);
-  options.max_errors = NUM2INT(max_parse_errors);
+static VALUE parse(VALUE self, VALUE string, VALUE url, VALUE max_errors, VALUE max_depth) {
+  GumboOptions options = kGumboDefaultOptions;
+  options.max_errors = NUM2INT(max_errors);
+  options.max_tree_depth = NUM2INT(max_depth);
   const char *input = RSTRING_PTR(string);
   size_t input_len = RSTRING_LEN(string);
   GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
-  xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
-#ifdef NGLIB
-  doc->type = XML_HTML_DOCUMENT_NODE;
-#endif
+  const char *status_string = gumbo_status_to_string(output->status);
+  switch (output->status) {
+  case GUMBO_STATUS_OK:
+    break;
+  case GUMBO_STATUS_TREE_TOO_DEEP:
+    gumbo_destroy_output(output);
+    rb_raise(rb_eArgError, "%s", status_string);
+  case GUMBO_STATUS_OUT_OF_MEMORY:
+    gumbo_destroy_output(output);
+    rb_raise(rb_eNoMemError, "%s", status_string);
+  }
+  xmlDocPtr doc;
   if (output->document->v.document.has_doctype) {
     const char *name   = output->document->v.document.name;
     const char *public = output->document->v.document.public_identifier;
     const char *system = output->document->v.document.system_identifier;
-    xmlCreateIntSubset(doc, CONST_CAST name,
-      (public[0] ? CONST_CAST public : NIL),
-      (system[0] ? CONST_CAST system : NIL));
+    public = public[0] ? public : NULL;
+    system = system[0] ? system : NULL;
+    doc = new_html_doc(name, system, public);
+  } else {
+    doc = new_html_doc(NULL, NULL, NULL);
   }
   GumboVector *children = &output->document->v.document.children;
-  for (int i=0; i < children->length; i++) {
+  for (size_t i=0; i < children->length; i++) {
     GumboNode *child = children->data[i];
     xmlNodePtr node = walk_tree(doc, child);
     if (node) {
@@ -222,28 +329,20 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
   // Add parse errors to rdoc.
   if (output->errors.length) {
     GumboVector *errors = &output->errors;
-    GumboParser parser = { ._options = &options };
     GumboStringBuffer msg;
     VALUE rerrors = rb_ary_new2(errors->length);
-    gumbo_string_buffer_init(&parser, &msg);
-    for (int i=0; i < errors->length; i++) {
+    gumbo_string_buffer_init(&msg);
+    for (size_t i=0; i < errors->length; i++) {
       GumboError *err = errors->data[i];
-      gumbo_string_buffer_clear(&parser, &msg);
-      // Work around bug in gumbo_caret_diagnostic_to_string.
-      // See https://github.com/google/gumbo-parser/pull/371
-      // The bug occurs when the error starts with a newline (unless it's the
-      // first character in the input--but that shouldn't cause an error in
-      // the first place.
-      if (*err->original_text == '\n' && err->original_text != input)
-        --err->original_text;
-      gumbo_caret_diagnostic_to_string(&parser, err, input, &msg);
+      gumbo_string_buffer_clear(&msg);
+      gumbo_caret_diagnostic_to_string(err, input, input_len, &msg);
       VALUE err_str = rb_str_new(msg.data, msg.length);
-      VALUE syntax_error = rb_class_new_instance(1, &err_str, XMLSyntaxError);
+      VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
       rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
       rb_iv_set(syntax_error, "@code", INT2NUM(1));   // XML_ERR_INTERNAL_ERROR
       rb_iv_set(syntax_error, "@level", INT2NUM(2));  // XML_ERR_ERROR
-      rb_iv_set(syntax_error, "@file", Qnil);
+      rb_iv_set(syntax_error, "@file", url);
       rb_iv_set(syntax_error, "@line", INT2NUM(err->position.line));
       rb_iv_set(syntax_error, "@str1", Qnil);
       rb_iv_set(syntax_error, "@str2", Qnil);
@@ -253,28 +352,28 @@ static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
       rb_ary_push(rerrors, syntax_error);
     }
     rb_iv_set(rdoc, "@errors", rerrors);
-    gumbo_string_buffer_destroy(&parser, &msg);
+    gumbo_string_buffer_destroy(&msg);
   }
-  gumbo_destroy_output(&options, output);
+  gumbo_destroy_output(output);
   return rdoc;
 }
 // Initialize the Nokogumbo class and fetch constants we will use later
-void Init_nokogumboc() {
+void Init_nokogumbo() {
   rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
   rb_require("nokogiri");
   // class constants
   VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
-  VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
-  Document = rb_const_get(HTML, rb_intern("Document"));
-  VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
-  XMLSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
+  VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
+  Document = rb_const_get(HTML5, rb_intern("Document"));
 #ifndef NGLIB
   // more class constants
+  VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
+  cNokogiriXmlSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
   Element = rb_const_get(XML, rb_intern("Element"));
   Text = rb_const_get(XML, rb_intern("Text"));
   CDATA = rb_const_get(XML, rb_intern("CDATA"));
@@ -282,14 +381,18 @@ void Init_nokogumboc() {
   // interned symbols
   new = rb_intern("new");
+  attribute = rb_intern("attribute");
   set_attribute = rb_intern("set_attribute");
+  remove_attribute = rb_intern("remove_attribute");
   add_child = rb_intern("add_child_node_and_reparent_attrs");
   internal_subset = rb_intern("internal_subset");
   remove_ = rb_intern("remove");
   create_internal_subset = rb_intern("create_internal_subset");
+  key_ = rb_intern("key?");
+  node_name_ = rb_intern("node_name=");
 #endif
-  // define Nokogumbo class with a singleton parse method
-  VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
-  rb_define_singleton_method(Gumbo, "parse", parse, 2);
+  // define Nokogumbo module with a parse method
+  VALUE Gumbo = rb_define_module("Nokogumbo");
+  rb_define_singleton_method(Gumbo, "parse", parse, 4);
 }